diff --git a/.azure-pipelines/azure-pipelines-linux.yml b/.azure-pipelines/azure-pipelines-linux.yml
index be1ea3f..6aedfe0 100755
--- a/.azure-pipelines/azure-pipelines-linux.yml
+++ b/.azure-pipelines/azure-pipelines-linux.yml
@@ -20,6 +20,10 @@ jobs:
         CONFIG: linux_64_c_compiler_version12cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version12
         UPLOAD_PACKAGES: 'True'
         DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cos7-x86_64
+      linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12:
+        CONFIG: linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12
+        UPLOAD_PACKAGES: 'True'
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cos7-x86_64
       linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7:
         CONFIG: linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7
         UPLOAD_PACKAGES: 'True'
diff --git a/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml b/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml
new file mode 100644
index 0000000..eef5d92
--- /dev/null
+++ b/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml
@@ -0,0 +1,66 @@
+boost_cpp:
+- 1.78.0
+c_compiler:
+- gcc
+c_compiler_version:
+- '12'
+cdt_name:
+- cos7
+channel_sources:
+- conda-forge
+channel_targets:
+- conda-forge main
+cuda_compiler:
+- cuda-nvcc
+cuda_compiler_version:
+- '12.0'
+cxx_compiler:
+- gxx
+cxx_compiler_version:
+- '12'
+docker_image:
+- quay.io/condaforge/linux-anvil-cos7-x86_64
+flann:
+- 1.9.1
+gflags:
+- '2.2'
+glew:
+- '2.1'
+glog:
+- '0.6'
+gmp:
+- '6'
+libblas:
+- 3.9 *netlib
+libcblas:
+- 3.9 *netlib
+libxcb:
+- '1.15'
+lz4_c:
+- 1.9.3
+metis:
+- '5.1'
+pin_run_as_build:
+  boost-cpp:
+    max_pin: x.x.x
+  flann:
+    max_pin: x.x.x
+  vlfeat:
+    max_pin: x.x.x
+qt_main:
+- '5.15'
+sqlite:
+- '3'
+suitesparse:
+- '5'
+target_platform:
+- linux-64
+vlfeat:
+- 0.9.21
+zip_keys:
+- - c_compiler_version
+  - cxx_compiler_version
+  - cuda_compiler
+  - cuda_compiler_version
+  - cdt_name
+  - docker_image
diff --git a/.ci_support/migrations/cuda120.yaml b/.ci_support/migrations/cuda120.yaml
new file mode 100644
index 0000000..25f0f88
--- /dev/null
+++ b/.ci_support/migrations/cuda120.yaml
@@ -0,0 +1,72 @@
+migrator_ts: 1682985063
+__migrator:
+  kind:
+    version
+  migration_number:
+    1
+  build_number:
+    1
+  paused: false
+  override_cbc_keys:
+    - cuda_compiler_stub
+  operation: key_add
+  check_solvable: false
+  primary_key: cuda_compiler_version
+  ordering:
+    cxx_compiler_version:
+      - 9
+      - 8
+      - 7
+    c_compiler_version:
+      - 9
+      - 8
+      - 7
+    fortran_compiler_version:
+      - 9
+      - 8
+      - 7
+    docker_image:
+      - quay.io/condaforge/linux-anvil-comp7  # [os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-aarch64  # [os.environ.get("BUILD_PLATFORM") == "linux-aarch64"]
+      - quay.io/condaforge/linux-anvil-ppc64le  # [os.environ.get("BUILD_PLATFORM") == "linux-ppc64le"]
+      - quay.io/condaforge/linux-anvil-armv7l  # [os.environ.get("BUILD_PLATFORM") == "linux-armv7l"]
+      - quay.io/condaforge/linux-anvil-cuda:9.2  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:10.0  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:10.1  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:10.2  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:11.0  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:11.1  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:11.2  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cos7-x86_64  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+    cuda_compiler_version:
+      - None
+      - 10.2  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 11.0  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 11.1  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 11.2  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 12.0  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  commit_message: "Rebuild for CUDA 12"
+
+cuda_compiler:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - cuda-nvcc  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cuda_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12.0  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+c_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cxx_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+fortran_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cudnn:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 8  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cdt_name:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - cos7  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+docker_image:  # [os.environ.get("BUILD_PLATFORM", "").startswith("linux-") and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - quay.io/condaforge/linux-anvil-cos7-x86_64  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64" and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
diff --git a/README.md b/README.md
index dc83bb8..4ab6a92 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,13 @@ Current build status
         variant
+
+        linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12
+
+
+        variant
+
+
         linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7
diff --git a/recipe/1809.patch b/recipe/1809.patch
new file mode 100644
index 0000000..0958c97
--- /dev/null
+++ b/recipe/1809.patch
@@ -0,0 +1,1125 @@
+diff --git a/src/mvs/cuda_array_wrapper.h b/src/mvs/cuda_array_wrapper.h
+deleted file mode 100644
+index e4e48b0e8..000000000
+--- a/src/mvs/cuda_array_wrapper.h
++++ /dev/null
+@@ -1,171 +0,0 @@
+-// Copyright (c) 2023, ETH Zurich and UNC Chapel Hill.
+-// All rights reserved.
+-// +-// Redistribution and use in source and binary forms, with or without +-// modification, are permitted provided that the following conditions are met: +-// +-// * Redistributions of source code must retain the above copyright +-// notice, this list of conditions and the following disclaimer. +-// +-// * Redistributions in binary form must reproduce the above copyright +-// notice, this list of conditions and the following disclaimer in the +-// documentation and/or other materials provided with the distribution. +-// +-// * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of +-// its contributors may be used to endorse or promote products derived +-// from this software without specific prior written permission. +-// +-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +-// POSSIBILITY OF SUCH DAMAGE. +-// +-// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de) +- +-#ifndef COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_ +-#define COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_ +- +-#include +- +-#include +- +-#include "mvs/gpu_mat.h" +-#include "util/cudacc.h" +- +-namespace colmap { +-namespace mvs { +- +-template +-class CudaArrayWrapper { +- public: +- CudaArrayWrapper(const size_t width, const size_t height, const size_t depth); +- ~CudaArrayWrapper(); +- +- const cudaArray* GetPtr() const; +- cudaArray* GetPtr(); +- +- size_t GetWidth() const; +- size_t GetHeight() const; +- size_t GetDepth() const; +- +- void CopyToDevice(const T* data); +- void CopyToHost(const T* data); +- void CopyFromGpuMat(const GpuMat& array); +- +- private: +- // Define class as non-copyable and non-movable. 
+- CudaArrayWrapper(CudaArrayWrapper const&) = delete; +- void operator=(CudaArrayWrapper const& obj) = delete; +- CudaArrayWrapper(CudaArrayWrapper&&) = delete; +- +- void Allocate(); +- void Deallocate(); +- +- cudaArray* array_; +- +- size_t width_; +- size_t height_; +- size_t depth_; +-}; +- +-//////////////////////////////////////////////////////////////////////////////// +-// Implementation +-//////////////////////////////////////////////////////////////////////////////// +- +-template +-CudaArrayWrapper::CudaArrayWrapper(const size_t width, const size_t height, +- const size_t depth) +- : width_(width), height_(height), depth_(depth), array_(nullptr) {} +- +-template +-CudaArrayWrapper::~CudaArrayWrapper() { +- Deallocate(); +-} +- +-template +-const cudaArray* CudaArrayWrapper::GetPtr() const { +- return array_; +-} +- +-template +-cudaArray* CudaArrayWrapper::GetPtr() { +- return array_; +-} +- +-template +-size_t CudaArrayWrapper::GetWidth() const { +- return width_; +-} +- +-template +-size_t CudaArrayWrapper::GetHeight() const { +- return height_; +-} +- +-template +-size_t CudaArrayWrapper::GetDepth() const { +- return depth_; +-} +- +-template +-void CudaArrayWrapper::CopyToDevice(const T* data) { +- cudaMemcpy3DParms params = {0}; +- Allocate(); +- params.extent = make_cudaExtent(width_, height_, depth_); +- params.kind = cudaMemcpyHostToDevice; +- params.dstArray = array_; +- params.srcPtr = +- make_cudaPitchedPtr((void*)data, width_ * sizeof(T), width_, height_); +- CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); +-} +- +-template +-void CudaArrayWrapper::CopyToHost(const T* data) { +- cudaMemcpy3DParms params = {0}; +- params.extent = make_cudaExtent(width_, height_, depth_); +- params.kind = cudaMemcpyDeviceToHost; +- params.dstPtr = +- make_cudaPitchedPtr((void*)data, width_ * sizeof(T), width_, height_); +- params.srcArray = array_; +- CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); +-} +- +-template +-void CudaArrayWrapper::CopyFromGpuMat(const GpuMat& array) { +- Allocate(); +- cudaMemcpy3DParms parameters = {0}; +- parameters.extent = make_cudaExtent(width_, height_, depth_); +- parameters.kind = cudaMemcpyDeviceToDevice; +- parameters.dstArray = array_; +- parameters.srcPtr = make_cudaPitchedPtr((void*)array.GetPtr(), +- array.GetPitch(), width_, height_); +- CUDA_SAFE_CALL(cudaMemcpy3D(¶meters)); +-} +- +-template +-void CudaArrayWrapper::Allocate() { +- Deallocate(); +- struct cudaExtent extent = make_cudaExtent(width_, height_, depth_); +- cudaChannelFormatDesc fmt = cudaCreateChannelDesc(); +- CUDA_SAFE_CALL(cudaMalloc3DArray(&array_, &fmt, extent, cudaArrayLayered)); +-} +- +-template +-void CudaArrayWrapper::Deallocate() { +- if (array_ != nullptr) { +- CUDA_SAFE_CALL(cudaFreeArray(array_)); +- array_ = nullptr; +- } +-} +- +-} // namespace mvs +-} // namespace colmap +- +-#endif // COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_ +diff --git a/src/mvs/cuda_texture.h b/src/mvs/cuda_texture.h +new file mode 100644 +index 000000000..3dcd8d171 +--- /dev/null ++++ b/src/mvs/cuda_texture.h +@@ -0,0 +1,180 @@ ++// Copyright (c) 2023, ETH Zurich and UNC Chapel Hill. ++// All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions are met: ++// ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. 
++// ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// ++// * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of ++// its contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE ++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++// POSSIBILITY OF SUCH DAMAGE. ++// ++// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de) ++ ++#ifndef COLMAP_SRC_MVS_CUDA_TEXTURE_H_ ++#define COLMAP_SRC_MVS_CUDA_TEXTURE_H_ ++ ++#include ++ ++#include ++ ++#include "mvs/gpu_mat.h" ++#include "util/cudacc.h" ++#include "util/logging.h" ++ ++namespace colmap { ++namespace mvs { ++ ++template ++class CudaArrayLayeredTexture { ++ public: ++ static std::unique_ptr> FromGpuMat( ++ const cudaTextureDesc& texture_desc, const GpuMat& mat); ++ static std::unique_ptr> FromHostArray( ++ const cudaTextureDesc& texture_desc, const size_t width, ++ const size_t height, const size_t depth, const T* data); ++ ++ cudaTextureObject_t GetObj() const; ++ ++ size_t GetWidth() const; ++ size_t GetHeight() const; ++ size_t GetDepth() const; ++ ++ CudaArrayLayeredTexture(const cudaTextureDesc& texture_desc, ++ const size_t width, const size_t height, ++ const size_t depth); ++ ~CudaArrayLayeredTexture(); ++ ++ private: ++ // Define class as non-copyable and non-movable. 
++ CudaArrayLayeredTexture(CudaArrayLayeredTexture const&) = delete; ++ void operator=(CudaArrayLayeredTexture const& obj) = delete; ++ CudaArrayLayeredTexture(CudaArrayLayeredTexture&&) = delete; ++ ++ const size_t width_; ++ const size_t height_; ++ const size_t depth_; ++ ++ cudaArray_t array_; ++ const cudaTextureDesc texture_desc_; ++ cudaResourceDesc resource_desc_; ++ cudaTextureObject_t texture_; ++}; ++ ++//////////////////////////////////////////////////////////////////////////////// ++// Implementation ++//////////////////////////////////////////////////////////////////////////////// ++ ++template ++std::unique_ptr> ++CudaArrayLayeredTexture::FromGpuMat(const cudaTextureDesc& texture_desc, ++ const GpuMat& mat) { ++ auto array = std::make_unique>( ++ texture_desc, mat.GetWidth(), mat.GetHeight(), mat.GetDepth()); ++ ++ cudaMemcpy3DParms params; ++ memset(¶ms, 0, sizeof(params)); ++ params.extent = ++ make_cudaExtent(mat.GetWidth(), mat.GetHeight(), mat.GetDepth()); ++ params.kind = cudaMemcpyDeviceToDevice; ++ params.srcPtr = make_cudaPitchedPtr((void*)mat.GetPtr(), mat.GetPitch(), ++ mat.GetWidth(), mat.GetHeight()); ++ params.dstArray = array->array_; ++ CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); ++ ++ return array; ++} ++ ++template ++std::unique_ptr> ++CudaArrayLayeredTexture::FromHostArray(const cudaTextureDesc& texture_desc, ++ const size_t width, ++ const size_t height, ++ const size_t depth, const T* data) { ++ auto array = std::make_unique>(texture_desc, width, ++ height, depth); ++ ++ cudaMemcpy3DParms params; ++ memset(¶ms, 0, sizeof(params)); ++ params.extent = make_cudaExtent(width, height, depth); ++ params.kind = cudaMemcpyHostToDevice; ++ params.srcPtr = ++ make_cudaPitchedPtr((void*)data, width * sizeof(T), width, height); ++ params.dstArray = array->array_; ++ CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); ++ ++ return array; ++} ++ ++template ++CudaArrayLayeredTexture::CudaArrayLayeredTexture( ++ const cudaTextureDesc& texture_desc, const size_t width, ++ const size_t height, const size_t depth) ++ : texture_desc_(texture_desc), ++ width_(width), ++ height_(height), ++ depth_(depth) { ++ CHECK_GT(width_, 0); ++ CHECK_GT(height_, 0); ++ CHECK_GT(depth_, 0); ++ ++ cudaExtent extent = make_cudaExtent(width_, height_, depth_); ++ cudaChannelFormatDesc fmt = cudaCreateChannelDesc(); ++ CUDA_SAFE_CALL(cudaMalloc3DArray(&array_, &fmt, extent, cudaArrayLayered)); ++ ++ memset(&resource_desc_, 0, sizeof(resource_desc_)); ++ resource_desc_.resType = cudaResourceTypeArray; ++ resource_desc_.res.array.array = array_; ++ ++ CUDA_SAFE_CALL(cudaCreateTextureObject(&texture_, &resource_desc_, ++ &texture_desc_, nullptr)); ++} ++ ++template ++CudaArrayLayeredTexture::~CudaArrayLayeredTexture() { ++ CUDA_SAFE_CALL(cudaFreeArray(array_)); ++ CUDA_SAFE_CALL(cudaDestroyTextureObject(texture_)); ++} ++ ++template ++cudaTextureObject_t CudaArrayLayeredTexture::GetObj() const { ++ return texture_; ++} ++ ++template ++size_t CudaArrayLayeredTexture::GetWidth() const { ++ return width_; ++} ++ ++template ++size_t CudaArrayLayeredTexture::GetHeight() const { ++ return height_; ++} ++ ++template ++size_t CudaArrayLayeredTexture::GetDepth() const { ++ return depth_; ++} ++ ++} // namespace mvs ++} // namespace colmap ++ ++#endif // COLMAP_SRC_MVS_CUDA_TEXTURE_H_ +diff --git a/src/mvs/gpu_mat_ref_image.cu b/src/mvs/gpu_mat_ref_image.cu +index c40a10bc3..1e3cc9d5e 100644 +--- a/src/mvs/gpu_mat_ref_image.cu ++++ b/src/mvs/gpu_mat_ref_image.cu +@@ -39,9 +39,8 @@ namespace colmap { + namespace mvs { + namespace { 
+ +-texture image_texture; +- +-__global__ void FilterKernel(GpuMat image, GpuMat sum_image, ++__global__ void FilterKernel(const cudaTextureObject_t image_texture, ++ GpuMat image, GpuMat sum_image, + GpuMat squared_sum_image, + const int window_radius, const int window_step, + const float sigma_spatial, +@@ -54,7 +53,7 @@ __global__ void FilterKernel(GpuMat image, GpuMat sum_image, + + BilateralWeightComputer bilateral_weight_computer(sigma_spatial, sigma_color); + +- const float center_color = tex2D(image_texture, col, row); ++ const float center_color = tex2D(image_texture, col, row); + + float color_sum = 0.0f; + float color_squared_sum = 0.0f; +@@ -65,7 +64,7 @@ __global__ void FilterKernel(GpuMat image, GpuMat sum_image, + for (int window_col = -window_radius; window_col <= window_radius; + window_col += window_step) { + const float color = +- tex2D(image_texture, col + window_col, row + window_row); ++ tex2D(image_texture, col + window_col, row + window_row); + const float bilateral_weight = bilateral_weight_computer.Compute( + window_row, window_col, center_color, color); + color_sum += bilateral_weight * color; +@@ -95,24 +94,25 @@ void GpuMatRefImage::Filter(const uint8_t* image_data, + const size_t window_radius, + const size_t window_step, const float sigma_spatial, + const float sigma_color) { +- CudaArrayWrapper image_array(width_, height_, 1); +- image_array.CopyToDevice(image_data); +- image_texture.addressMode[0] = cudaAddressModeBorder; +- image_texture.addressMode[1] = cudaAddressModeBorder; +- image_texture.addressMode[2] = cudaAddressModeBorder; +- image_texture.filterMode = cudaFilterModePoint; +- image_texture.normalized = false; ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeNormalizedFloat; ++ texture_desc.normalizedCoords = false; ++ auto image_texture = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, width_, height_, 1, image_data); + + const dim3 block_size(kBlockDimX, kBlockDimY); + const dim3 grid_size((width_ - 1) / block_size.x + 1, + (height_ - 1) / block_size.y + 1); + +- CUDA_SAFE_CALL(cudaBindTextureToArray(image_texture, image_array.GetPtr())); + FilterKernel<<>>( +- *image, *sum_image, *squared_sum_image, window_radius, window_step, +- sigma_spatial, sigma_color); ++ image_texture->GetObj(), *image, *sum_image, *squared_sum_image, ++ window_radius, window_step, sigma_spatial, sigma_color); + CUDA_SYNC_AND_CHECK(); +- CUDA_SAFE_CALL(cudaUnbindTexture(image_texture)); + } + + } // namespace mvs +diff --git a/src/mvs/gpu_mat_ref_image.h b/src/mvs/gpu_mat_ref_image.h +index 1e04e5f43..4f4be34e6 100644 +--- a/src/mvs/gpu_mat_ref_image.h ++++ b/src/mvs/gpu_mat_ref_image.h +@@ -34,7 +34,7 @@ + + #include + +-#include "mvs/cuda_array_wrapper.h" ++#include "mvs/cuda_texture.h" + #include "mvs/gpu_mat.h" + + namespace colmap { +@@ -64,8 +64,8 @@ class GpuMatRefImage { + const static size_t kBlockDimX = 16; + const static size_t kBlockDimY = 12; + +- size_t width_; +- size_t height_; ++ const size_t width_; ++ const size_t height_; + }; + + struct BilateralWeightComputer { +diff --git a/src/mvs/patch_match_cuda.cu b/src/mvs/patch_match_cuda.cu +index 845fffa94..772f341a1 100644 +--- a/src/mvs/patch_match_cuda.cu ++++ b/src/mvs/patch_match_cuda.cu +@@ -56,14 +56,6 
@@ + namespace colmap { + namespace mvs { + +-texture +- ref_image_texture; +-texture +- src_images_texture; +-texture +- src_depth_maps_texture; +-texture poses_texture; +- + // Calibration of reference image as {fx, cx, fy, cy}. + __constant__ float ref_K[4]; + // Calibration of reference image as {1/fx, -cx/fx, 1/fy, -cy/fy}. +@@ -229,18 +221,17 @@ __device__ inline float PropagateDepth(const float depth1, + // First, compute triangulation angle between reference and source image for 3D + // point. Second, compute incident angle between viewing direction of source + // image and normal direction of 3D point. Both angles are cosine distances. +-__device__ inline void ComputeViewingAngles(const float point[3], +- const float normal[3], +- const int image_idx, +- float* cos_triangulation_angle, +- float* cos_incident_angle) { ++__device__ inline void ComputeViewingAngles( ++ const cudaTextureObject_t poses_texture, const float point[3], ++ const float normal[3], const int image_idx, float* cos_triangulation_angle, ++ float* cos_incident_angle) { + *cos_triangulation_angle = 0.0f; + *cos_incident_angle = 0.0f; + + // Projection center of source image. + float C[3]; + for (int i = 0; i < 3; ++i) { +- C[i] = tex2D(poses_texture, i + 16, image_idx); ++ C[i] = tex2D(poses_texture, i + 16, image_idx); + } + + // Ray from point to camera. +@@ -256,25 +247,25 @@ __device__ inline void ComputeViewingAngles(const float point[3], + *cos_triangulation_angle = DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm; + } + +-__device__ inline void ComposeHomography(const int image_idx, const int row, +- const int col, const float depth, +- const float normal[3], float H[9]) { ++__device__ inline void ComposeHomography( ++ const cudaTextureObject_t poses_texture, const int image_idx, const int row, ++ const int col, const float depth, const float normal[3], float H[9]) { + // Calibration of source image. + float K[4]; + for (int i = 0; i < 4; ++i) { +- K[i] = tex2D(poses_texture, i, image_idx); ++ K[i] = tex2D(poses_texture, i, image_idx); + } + + // Relative rotation between reference and source image. + float R[9]; + for (int i = 0; i < 9; ++i) { +- R[i] = tex2D(poses_texture, i + 4, image_idx); ++ R[i] = tex2D(poses_texture, i + 4, image_idx); + } + + // Relative translation between reference and source image. + float T[3]; + for (int i = 0; i < 3; ++i) { +- T[i] = tex2D(poses_texture, i + 13, image_idx); ++ T[i] = tex2D(poses_texture, i + 13, image_idx); + } + + // Distance to the plane. 
+@@ -332,6 +323,9 @@ struct LocalRefImage { + const static int kNumColumns = kThreadBlockSize * THREADS_PER_BLOCK; + const static int kDataSize = kNumRows * kNumColumns; + ++ __device__ explicit LocalRefImage(const cudaTextureObject_t ref_image_texture) ++ : ref_image_texture_(ref_image_texture) {} ++ + float* data = nullptr; + + __device__ inline void Read(const int row) { +@@ -357,7 +351,7 @@ struct LocalRefImage { + #pragma unroll + for (int block = 0; block < kThreadBlockSize; ++block) { + data[local_row * kNumColumns + local_col] = +- tex2D(ref_image_texture, global_col, global_row); ++ tex2D(ref_image_texture_, global_col, global_row); + local_col += THREADS_PER_BLOCK; + global_col += THREADS_PER_BLOCK; + } +@@ -382,12 +376,15 @@ struct LocalRefImage { + #pragma unroll + for (int block = 0; block < kThreadBlockSize; ++block) { + data[local_row * kNumColumns + local_col] = +- tex2D(ref_image_texture, global_col, global_row); ++ tex2D(ref_image_texture_, global_col, global_row); + local_col += THREADS_PER_BLOCK; + global_col += THREADS_PER_BLOCK; + } + } + } ++ ++ private: ++ const cudaTextureObject_t ref_image_texture_; + }; + + // The return values is 1 - NCC, so the range is [0, 2], the smaller the +@@ -396,9 +393,15 @@ template + struct PhotoConsistencyCostComputer { + const static int kWindowRadius = kWindowSize / 2; + +- __device__ PhotoConsistencyCostComputer(const float sigma_spatial, +- const float sigma_color) +- : bilateral_weight_computer_(sigma_spatial, sigma_color) {} ++ __device__ PhotoConsistencyCostComputer( ++ const cudaTextureObject_t ref_image_texture, ++ const cudaTextureObject_t src_images_texture, ++ const cudaTextureObject_t poses_texture, const float sigma_spatial, ++ const float sigma_color) ++ : local_ref_image(ref_image_texture), ++ src_images_texture_(src_images_texture), ++ poses_texture_(poses_texture), ++ bilateral_weight_computer_(sigma_spatial, sigma_color) {} + + // Maximum photo consistency cost as 1 - min(NCC). 
+ const float kMaxCost = 2.0f; +@@ -429,7 +432,8 @@ struct PhotoConsistencyCostComputer { + + __device__ inline float Compute() const { + float tform[9]; +- ComposeHomography(src_image_idx, row, col, depth, normal, tform); ++ ComposeHomography(poses_texture_, src_image_idx, row, col, depth, normal, ++ tform); + + float tform_step[8]; + for (int i = 0; i < 8; ++i) { +@@ -467,8 +471,8 @@ struct PhotoConsistencyCostComputer { + const float norm_col_src = inv_z * col_src + 0.5f; + const float norm_row_src = inv_z * row_src + 0.5f; + const float ref_color = local_ref_image.data[ref_image_idx]; +- const float src_color = tex2DLayered(src_images_texture, norm_col_src, +- norm_row_src, src_image_idx); ++ const float src_color = tex2DLayered( ++ src_images_texture_, norm_col_src, norm_row_src, src_image_idx); + + const float bilateral_weight = bilateral_weight_computer_.Compute( + row, col, ref_center_color, ref_color); +@@ -528,22 +532,24 @@ struct PhotoConsistencyCostComputer { + } + + private: ++ const cudaTextureObject_t src_images_texture_; ++ const cudaTextureObject_t poses_texture_; + const BilateralWeightComputer bilateral_weight_computer_; + }; + +-__device__ inline float ComputeGeomConsistencyCost(const float row, +- const float col, +- const float depth, +- const int image_idx, +- const float max_cost) { ++__device__ inline float ComputeGeomConsistencyCost( ++ const cudaTextureObject_t poses_texture, ++ const cudaTextureObject_t src_depth_maps_texture, const float row, ++ const float col, const float depth, const int image_idx, ++ const float max_cost) { + // Extract projection matrices for source image. + float P[12]; + for (int i = 0; i < 12; ++i) { +- P[i] = tex2D(poses_texture, i + 19, image_idx); ++ P[i] = tex2D(poses_texture, i + 19, image_idx); + } + float inv_P[12]; + for (int i = 0; i < 12; ++i) { +- inv_P[i] = tex2D(poses_texture, i + 31, image_idx); ++ inv_P[i] = tex2D(poses_texture, i + 31, image_idx); + } + + // Project point in reference image to world. +@@ -562,8 +568,8 @@ __device__ inline float ComputeGeomConsistencyCost(const float row, + P[6] * forward_point[2] + P[7]); + + // Extract depth in source image. +- const float src_depth = tex2DLayered(src_depth_maps_texture, src_col + 0.5f, +- src_row + 0.5f, image_idx); ++ const float src_depth = tex2DLayered( ++ src_depth_maps_texture, src_col + 0.5f, src_row + 0.5f, image_idx); + + // Projection outside of source image. 
+ if (src_depth == 0.0f) { +@@ -794,15 +800,20 @@ template + __global__ void ComputeInitialCost(GpuMat cost_map, + const GpuMat depth_map, + const GpuMat normal_map, ++ const cudaTextureObject_t ref_image_texture, + const GpuMat ref_sum_image, + const GpuMat ref_squared_sum_image, ++ const cudaTextureObject_t src_images_texture, ++ const cudaTextureObject_t poses_texture, + const float sigma_spatial, + const float sigma_color) { + const int col = blockDim.x * blockIdx.x + threadIdx.x; + + typedef PhotoConsistencyCostComputer + PhotoConsistencyCostComputerType; +- PhotoConsistencyCostComputerType pcc_computer(sigma_spatial, sigma_color); ++ PhotoConsistencyCostComputerType pcc_computer( ++ ref_image_texture, src_images_texture, poses_texture, sigma_spatial, ++ sigma_color); + pcc_computer.col = col; + + __shared__ float local_ref_image_data +@@ -859,8 +870,13 @@ __global__ void SweepFromTopToBottom( + GpuMat global_workspace, GpuMat rand_state_map, + GpuMat cost_map, GpuMat depth_map, GpuMat normal_map, + GpuMat consistency_mask, GpuMat sel_prob_map, +- const GpuMat prev_sel_prob_map, const GpuMat ref_sum_image, +- const GpuMat ref_squared_sum_image, const SweepOptions options) { ++ const GpuMat prev_sel_prob_map, ++ const cudaTextureObject_t ref_image_texture, ++ const GpuMat ref_sum_image, ++ const GpuMat ref_squared_sum_image, ++ const cudaTextureObject_t src_images_texture, ++ const cudaTextureObject_t src_depth_maps_texture, ++ const cudaTextureObject_t poses_texture, const SweepOptions options) { + const int col = blockDim.x * blockIdx.x + threadIdx.x; + + // Probability for boundary pixels. +@@ -904,8 +920,9 @@ __global__ void SweepFromTopToBottom( + + typedef PhotoConsistencyCostComputer + PhotoConsistencyCostComputerType; +- PhotoConsistencyCostComputerType pcc_computer(options.sigma_spatial, +- options.sigma_color); ++ PhotoConsistencyCostComputerType pcc_computer( ++ ref_image_texture, src_images_texture, poses_texture, ++ options.sigma_spatial, options.sigma_color); + pcc_computer.col = col; + + __shared__ float local_ref_image_data +@@ -982,16 +999,17 @@ __global__ void SweepFromTopToBottom( + + float cos_triangulation_angle; + float cos_incident_angle; +- ComputeViewingAngles(point, curr_param_state.normal, image_idx, +- &cos_triangulation_angle, &cos_incident_angle); ++ ComputeViewingAngles(poses_texture, point, curr_param_state.normal, ++ image_idx, &cos_triangulation_angle, ++ &cos_incident_angle); + const float tri_prob = + likelihood_computer.ComputeTriProb(cos_triangulation_angle); + const float inc_prob = + likelihood_computer.ComputeIncProb(cos_incident_angle); + + float H[9]; +- ComposeHomography(image_idx, row, col, curr_param_state.depth, +- curr_param_state.normal, H); ++ ComposeHomography(poses_texture, image_idx, row, col, ++ curr_param_state.depth, curr_param_state.normal, H); + const float res_prob = + likelihood_computer.ComputeResolutionProb(H, row, col); + +@@ -1035,10 +1053,11 @@ __global__ void SweepFromTopToBottom( + + costs[0] += cost_map.Get(row, col, pcc_computer.src_image_idx); + if (kGeomConsistencyTerm) { +- costs[0] += options.geom_consistency_regularizer * +- ComputeGeomConsistencyCost( +- row, col, depths[0], pcc_computer.src_image_idx, +- options.geom_consistency_max_cost); ++ costs[0] += ++ options.geom_consistency_regularizer * ++ ComputeGeomConsistencyCost( ++ poses_texture, src_depth_maps_texture, row, col, depths[0], ++ pcc_computer.src_image_idx, options.geom_consistency_max_cost); + } + + for (int i = 1; i < kNumCosts; ++i) { +@@ -1048,7 
+1067,8 @@ __global__ void SweepFromTopToBottom( + if (kGeomConsistencyTerm) { + costs[i] += options.geom_consistency_regularizer * + ComputeGeomConsistencyCost( +- row, col, depths[i], pcc_computer.src_image_idx, ++ poses_texture, src_depth_maps_texture, row, col, ++ depths[i], pcc_computer.src_image_idx, + options.geom_consistency_max_cost); + } + } +@@ -1102,7 +1122,7 @@ __global__ void SweepFromTopToBottom( + for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) { + float cos_triangulation_angle; + float cos_incident_angle; +- ComputeViewingAngles(best_point, best_normal, image_idx, ++ ComputeViewingAngles(poses_texture, best_point, best_normal, image_idx, + &cos_triangulation_angle, &cos_incident_angle); + if (cos_triangulation_angle > cos_min_triangulation_angle || + cos_incident_angle <= 0.0f) { +@@ -1115,7 +1135,8 @@ __global__ void SweepFromTopToBottom( + num_consistent += 1; + } + } else if (!kFilterPhotoConsistency) { +- if (ComputeGeomConsistencyCost(row, col, best_depth, image_idx, ++ if (ComputeGeomConsistencyCost(poses_texture, src_depth_maps_texture, ++ row, col, best_depth, image_idx, + options.geom_consistency_max_cost) <= + options.filter_geom_consistency_max_cost) { + consistency_mask.Set(row, col, image_idx, 1); +@@ -1123,7 +1144,8 @@ __global__ void SweepFromTopToBottom( + } + } else { + if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob && +- ComputeGeomConsistencyCost(row, col, best_depth, image_idx, ++ ComputeGeomConsistencyCost(poses_texture, src_depth_maps_texture, ++ row, col, best_depth, image_idx, + options.geom_consistency_max_cost) <= + options.filter_geom_consistency_max_cost) { + consistency_mask.Set(row, col, image_idx, 1); +@@ -1169,53 +1191,49 @@ PatchMatchCuda::PatchMatchCuda(const PatchMatchOptions& options, + InitWorkspaceMemory(); + } + +-PatchMatchCuda::~PatchMatchCuda() { +- for (size_t i = 0; i < 4; ++i) { +- poses_device_[i].reset(); +- } +-} +- + void PatchMatchCuda::Run() { + #define CASE_WINDOW_RADIUS(window_radius, window_step) \ + case window_radius: \ + RunWithWindowSizeAndStep<2 * window_radius + 1, window_step>(); \ + break; + +-#define CASE_WINDOW_STEP(window_step) \ +- case window_step: \ +- switch (options_.window_radius) { \ +- CASE_WINDOW_RADIUS(1, window_step) \ +- CASE_WINDOW_RADIUS(2, window_step) \ +- CASE_WINDOW_RADIUS(3, window_step) \ +- CASE_WINDOW_RADIUS(4, window_step) \ +- CASE_WINDOW_RADIUS(5, window_step) \ +- CASE_WINDOW_RADIUS(6, window_step) \ +- CASE_WINDOW_RADIUS(7, window_step) \ +- CASE_WINDOW_RADIUS(8, window_step) \ +- CASE_WINDOW_RADIUS(9, window_step) \ +- CASE_WINDOW_RADIUS(10, window_step) \ +- CASE_WINDOW_RADIUS(11, window_step) \ +- CASE_WINDOW_RADIUS(12, window_step) \ +- CASE_WINDOW_RADIUS(13, window_step) \ +- CASE_WINDOW_RADIUS(14, window_step) \ +- CASE_WINDOW_RADIUS(15, window_step) \ +- CASE_WINDOW_RADIUS(16, window_step) \ +- CASE_WINDOW_RADIUS(17, window_step) \ +- CASE_WINDOW_RADIUS(18, window_step) \ +- CASE_WINDOW_RADIUS(19, window_step) \ +- CASE_WINDOW_RADIUS(20, window_step) \ +- default: { \ +- std::cerr << "Error: Window size not supported" << std::endl; \ +- break; \ +- } \ +- } \ ++#define CASE_WINDOW_STEP(window_step) \ ++ case window_step: \ ++ switch (options_.window_radius) { \ ++ CASE_WINDOW_RADIUS(1, window_step) \ ++ CASE_WINDOW_RADIUS(2, window_step) \ ++ CASE_WINDOW_RADIUS(3, window_step) \ ++ CASE_WINDOW_RADIUS(4, window_step) \ ++ CASE_WINDOW_RADIUS(5, window_step) \ ++ CASE_WINDOW_RADIUS(6, window_step) \ ++ CASE_WINDOW_RADIUS(7, window_step) \ ++ 
CASE_WINDOW_RADIUS(8, window_step) \ ++ CASE_WINDOW_RADIUS(9, window_step) \ ++ CASE_WINDOW_RADIUS(10, window_step) \ ++ CASE_WINDOW_RADIUS(11, window_step) \ ++ CASE_WINDOW_RADIUS(12, window_step) \ ++ CASE_WINDOW_RADIUS(13, window_step) \ ++ CASE_WINDOW_RADIUS(14, window_step) \ ++ CASE_WINDOW_RADIUS(15, window_step) \ ++ CASE_WINDOW_RADIUS(16, window_step) \ ++ CASE_WINDOW_RADIUS(17, window_step) \ ++ CASE_WINDOW_RADIUS(18, window_step) \ ++ CASE_WINDOW_RADIUS(19, window_step) \ ++ CASE_WINDOW_RADIUS(20, window_step) \ ++ default: { \ ++ std::cerr << "Error: Window size " << options_.window_radius \ ++ << " not supported" << std::endl; \ ++ break; \ ++ } \ ++ } \ + break; + + switch (options_.window_step) { + CASE_WINDOW_STEP(1) + CASE_WINDOW_STEP(2) + default: { +- std::cerr << "Error: Window step not supported" << std::endl; ++ std::cerr << "Error: Window step " << options_.window_step ++ << " not supported" << std::endl; + break; + } + } +@@ -1274,9 +1292,10 @@ void PatchMatchCuda::RunWithWindowSizeAndStep() { + ComputeCudaConfig(); + ComputeInitialCost + <<>>( +- *cost_map_, *depth_map_, *normal_map_, *ref_image_->sum_image, +- *ref_image_->squared_sum_image, options_.sigma_spatial, +- options_.sigma_color); ++ *cost_map_, *depth_map_, *normal_map_, ref_image_texture_->GetObj(), ++ *ref_image_->sum_image, *ref_image_->squared_sum_image, ++ src_images_texture_->GetObj(), poses_texture_[0]->GetObj(), ++ options_.sigma_spatial, options_.sigma_color); + CUDA_SYNC_AND_CHECK(); + + init_timer.Print("Initialization"); +@@ -1324,8 +1343,13 @@ void PatchMatchCuda::RunWithWindowSizeAndStep() { + <<>>( \ + *global_workspace_, *rand_state_map_, *cost_map_, *depth_map_, \ + *normal_map_, *consistency_mask_, *sel_prob_map_, \ +- *prev_sel_prob_map_, *ref_image_->sum_image, \ +- *ref_image_->squared_sum_image, sweep_options); ++ *prev_sel_prob_map_, ref_image_texture_->GetObj(), \ ++ *ref_image_->sum_image, *ref_image_->squared_sum_image, \ ++ src_images_texture_->GetObj(), \ ++ src_depth_maps_texture_ == nullptr \ ++ ? 0 \ ++ : src_depth_maps_texture_->GetObj(), \ ++ poses_texture_[rotation_in_half_pi_]->GetObj(), sweep_options); + + if (last_sweep) { + if (options_.filter) { +@@ -1410,13 +1434,26 @@ void PatchMatchCuda::ComputeCudaConfig() { + elem_wise_grid_size_.z = 1; + } + ++void PatchMatchCuda::BindRefImageTexture() { ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeNormalizedFloat; ++ texture_desc.normalizedCoords = false; ++ ref_image_texture_ = CudaArrayLayeredTexture::FromGpuMat( ++ texture_desc, *ref_image_->image); ++} ++ + void PatchMatchCuda::InitRefImage() { + const Image& ref_image = problem_.images->at(problem_.ref_image_idx); + + ref_width_ = ref_image.GetWidth(); + ref_height_ = ref_image.GetHeight(); + +- // Upload to device. ++ // Upload to device and filter. 
+ ref_image_.reset(new GpuMatRefImage(ref_width_, ref_height_)); + const std::vector ref_image_array = + ref_image.GetBitmap().ConvertToRowMajorArray(); +@@ -1424,18 +1461,7 @@ void PatchMatchCuda::InitRefImage() { + options_.window_step, options_.sigma_spatial, + options_.sigma_color); + +- ref_image_device_.reset( +- new CudaArrayWrapper(ref_width_, ref_height_, 1)); +- ref_image_device_->CopyFromGpuMat(*ref_image_->image); +- +- // Create texture. +- ref_image_texture.addressMode[0] = cudaAddressModeBorder; +- ref_image_texture.addressMode[1] = cudaAddressModeBorder; +- ref_image_texture.addressMode[2] = cudaAddressModeBorder; +- ref_image_texture.filterMode = cudaFilterModePoint; +- ref_image_texture.normalized = false; +- CUDA_SAFE_CALL( +- cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr())); ++ BindRefImageTexture(); + } + + void PatchMatchCuda::InitSourceImages() { +@@ -1470,19 +1496,18 @@ void PatchMatchCuda::InitSourceImages() { + } + } + +- // Upload to device. +- src_images_device_.reset(new CudaArrayWrapper( +- max_width, max_height, problem_.src_image_idxs.size())); +- src_images_device_->CopyToDevice(src_images_host_data.data()); +- + // Create source images texture. +- src_images_texture.addressMode[0] = cudaAddressModeBorder; +- src_images_texture.addressMode[1] = cudaAddressModeBorder; +- src_images_texture.addressMode[2] = cudaAddressModeBorder; +- src_images_texture.filterMode = cudaFilterModeLinear; +- src_images_texture.normalized = false; +- CUDA_SAFE_CALL(cudaBindTextureToArray(src_images_texture, +- src_images_device_->GetPtr())); ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModeLinear; ++ texture_desc.readMode = cudaReadModeNormalizedFloat; ++ texture_desc.normalizedCoords = false; ++ src_images_texture_ = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, max_width, max_height, problem_.src_image_idxs.size(), ++ src_images_host_data.data()); + } + + // Upload source depth maps to device. +@@ -1504,19 +1529,18 @@ void PatchMatchCuda::InitSourceImages() { + } + } + +- src_depth_maps_device_.reset(new CudaArrayWrapper( +- max_width, max_height, problem_.src_image_idxs.size())); +- src_depth_maps_device_->CopyToDevice(src_depth_maps_host_data.data()); +- + // Create source depth maps texture. +- src_depth_maps_texture.addressMode[0] = cudaAddressModeBorder; +- src_depth_maps_texture.addressMode[1] = cudaAddressModeBorder; +- src_depth_maps_texture.addressMode[2] = cudaAddressModeBorder; +- // TODO: Check if linear interpolation improves results or not. 
+- src_depth_maps_texture.filterMode = cudaFilterModePoint; +- src_depth_maps_texture.normalized = false; +- CUDA_SAFE_CALL(cudaBindTextureToArray(src_depth_maps_texture, +- src_depth_maps_device_->GetPtr())); ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeElementType; ++ texture_desc.normalizedCoords = false; ++ src_depth_maps_texture_ = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, max_width, max_height, problem_.src_image_idxs.size(), ++ src_depth_maps_host_data.data()); + } + } + +@@ -1576,6 +1600,15 @@ void PatchMatchCuda::InitTransforms() { + // Matrix for 90deg rotation around Z-axis in counter-clockwise direction. + const float R_z90[9] = {0, 1, 0, -1, 0, 0, 0, 0, 1}; + ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeElementType; ++ texture_desc.normalizedCoords = false; ++ + for (size_t i = 0; i < 4; ++i) { + const size_t kNumTformParams = 4 + 9 + 3 + 3 + 12 + 12; + std::vector poses_host_data(kNumTformParams * +@@ -1614,20 +1647,12 @@ void PatchMatchCuda::InitTransforms() { + offset += 12; + } + +- poses_device_[i].reset(new CudaArrayWrapper( +- kNumTformParams, problem_.src_image_idxs.size(), 1)); +- poses_device_[i]->CopyToDevice(poses_host_data.data()); ++ poses_texture_[i] = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, kNumTformParams, problem_.src_image_idxs.size(), 1, ++ poses_host_data.data()); + + RotatePose(R_z90, rotated_R, rotated_T); + } +- +- poses_texture.addressMode[0] = cudaAddressModeBorder; +- poses_texture.addressMode[1] = cudaAddressModeBorder; +- poses_texture.addressMode[2] = cudaAddressModeBorder; +- poses_texture.filterMode = cudaFilterModePoint; +- poses_texture.normalized = false; +- CUDA_SAFE_CALL( +- cudaBindTextureToArray(poses_texture, poses_device_[0]->GetPtr())); + } + + void PatchMatchCuda::InitWorkspaceMemory() { +@@ -1727,15 +1752,9 @@ void PatchMatchCuda::Rotate() { + ref_image_->squared_sum_image->Rotate( + rotated_ref_image->squared_sum_image.get()); + ref_image_.swap(rotated_ref_image); ++ BindRefImageTexture(); + } + +- // Bind rotated reference image to texture. +- ref_image_device_.reset(new CudaArrayWrapper(width, height, 1)); +- ref_image_device_->CopyFromGpuMat(*ref_image_->image); +- CUDA_SAFE_CALL(cudaUnbindTexture(ref_image_texture)); +- CUDA_SAFE_CALL( +- cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr())); +- + // Rotate selection probability map. + prev_sel_prob_map_.reset( + new GpuMat(width, height, problem_.src_image_idxs.size())); +@@ -1751,11 +1770,6 @@ void PatchMatchCuda::Rotate() { + cost_map_.swap(rotated_cost_map); + } + +- // Rotate transformations. +- CUDA_SAFE_CALL(cudaUnbindTexture(poses_texture)); +- CUDA_SAFE_CALL(cudaBindTextureToArray( +- poses_texture, poses_device_[rotation_in_half_pi_]->GetPtr())); +- + // Rotate calibration. 
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[rotation_in_half_pi_], + sizeof(float) * 4, 0, +diff --git a/src/mvs/patch_match_cuda.h b/src/mvs/patch_match_cuda.h +index adbecdbd9..9e85e9b32 100644 +--- a/src/mvs/patch_match_cuda.h ++++ b/src/mvs/patch_match_cuda.h +@@ -38,7 +38,7 @@ + + #include + +-#include "mvs/cuda_array_wrapper.h" ++#include "mvs/cuda_texture.h" + #include "mvs/depth_map.h" + #include "mvs/gpu_mat.h" + #include "mvs/gpu_mat_prng.h" +@@ -54,7 +54,6 @@ class PatchMatchCuda { + public: + PatchMatchCuda(const PatchMatchOptions& options, + const PatchMatch::Problem& problem); +- ~PatchMatchCuda(); + + void Run(); + +@@ -69,6 +68,8 @@ class PatchMatchCuda { + + void ComputeCudaConfig(); + ++ void BindRefImageTexture(); ++ + void InitRefImage(); + void InitSourceImages(); + void InitTransforms(); +@@ -96,9 +97,9 @@ class PatchMatchCuda { + int rotation_in_half_pi_; + + // Reference and source image input data. +- std::unique_ptr> ref_image_device_; +- std::unique_ptr> src_images_device_; +- std::unique_ptr> src_depth_maps_device_; ++ std::unique_ptr> ref_image_texture_; ++ std::unique_ptr> src_images_texture_; ++ std::unique_ptr> src_depth_maps_texture_; + + // Relative poses from rotated versions of reference image to source images + // corresponding to _rotationInHalfPi: +@@ -114,7 +115,7 @@ class PatchMatchCuda { + // R, T, C, P, P^-1 denote the relative rotation, translation, camera + // center, projection, and inverse projection from there reference to the + // i-th source image. +- std::unique_ptr> poses_device_[4]; ++ std::unique_ptr> poses_texture_[4]; + + // Calibration matrix for rotated versions of reference image + // as {K[0, 0], K[0, 2], K[1, 1], K[1, 2]} corresponding to _rotationInHalfPi. diff --git a/recipe/1823.patch b/recipe/1823.patch new file mode 100644 index 0000000..2c31a89 --- /dev/null +++ b/recipe/1823.patch @@ -0,0 +1,39 @@ +From 821a85ba0d96ce5f53a15d8a538de268b891b3d0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Wed, 1 Mar 2023 22:10:07 +0100 +Subject: [PATCH] Remove unused SIFT GPU cuda texture reference + +--- + lib/SiftGPU/ProgramCU.cu | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu +index 51c781341..9d842770d 100644 +--- a/lib/SiftGPU/ProgramCU.cu ++++ b/lib/SiftGPU/ProgramCU.cu +@@ -1683,9 +1683,6 @@ void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2, + MatH, hdistmax, MatF, fdistmax); + } + +- +-texture texDOT; +- + #define ROWMATCH_BLOCK_WIDTH 32 + #define ROWMATCH_BLOCK_HEIGHT 1 + +@@ -1755,15 +1752,12 @@ void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float dist + int num2 = texDot->GetImgWidth(); + dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT); + dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT); +- // texDot->BindTexture(texDOT); + RowMatch_Kernel<<>>((int*)texDot->_cuData, + (int*)texMatch->_cuData, num2, distmax, ratiomax); + } + + #define COLMATCH_BLOCK_WIDTH 32 + +-//texture texCT; +- + void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax) + { + int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x; diff --git a/recipe/1838.patch b/recipe/1838.patch new file mode 100644 index 0000000..84ae322 --- /dev/null +++ b/recipe/1838.patch @@ -0,0 +1,1148 @@ +From d361730f19a675e1f60b3fe45333441deb0e3d1d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Sat, 11 Mar 2023 22:28:18 +0100 +Subject: [PATCH 
01/14] [WIP] Upgrade SiftGPU to use CUDA texture objects + +--- + lib/SiftGPU/CuTexImage.cpp | 54 +++++++++++++++++++- + lib/SiftGPU/CuTexImage.h | 12 ++++- + lib/SiftGPU/ProgramCU.cu | 101 ++++++++++++++++++++----------------- + 3 files changed, 118 insertions(+), 49 deletions(-) + +diff --git a/lib/SiftGPU/CuTexImage.cpp b/lib/SiftGPU/CuTexImage.cpp +index a4ef59bba..be0383d2b 100644 +--- a/lib/SiftGPU/CuTexImage.cpp ++++ b/lib/SiftGPU/CuTexImage.cpp +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + using namespace std; + + +@@ -39,10 +40,48 @@ using namespace std; + #include "CuTexImage.h" + #include "ProgramCU.h" + +-#if CUDA_VERSION <= 2010 && defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +-#error "Require CUDA 2.2 or higher" +-#endif ++CuTexImage::CuTexObj::~CuTexObj() ++{ ++ cudaDestroyTextureObject(handle); ++} ++ ++CuTexImage::CuTexObj CuTexImage::BindTexture(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc) ++{ ++ CuTexObj texObj; + ++ cudaResourceDesc resourceDesc; ++ memset(&resourceDesc, 0, sizeof(resourceDesc)); ++ resourceDesc.resType = cudaResourceTypeLinear; ++ resourceDesc.res.linear.devPtr = _cuData; ++ resourceDesc.res.linear.desc = channelFmtDesc; ++ resourceDesc.res.linear.sizeInBytes = _numBytes; ++ ++ cudaCreateTextureObject(&texObj.handle, &resourceDesc, &textureDesc, nullptr); ++ ProgramCU::CheckErrorCUDA("CuTexImage::BindTexture"); ++ ++ return texObj; ++} ++ ++CuTexImage::CuTexObj CuTexImage::BindTexture2D(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc) ++{ ++ CuTexObj texObj; ++ ++ cudaResourceDesc resourceDesc; ++ memset(&resourceDesc, 0, sizeof(resourceDesc)); ++ resourceDesc.resType = cudaResourceTypePitch2D; ++ resourceDesc.res.pitch2D.devPtr = _cuData; ++ resourceDesc.res.pitch2D.width = _imgWidth; ++ resourceDesc.res.pitch2D.height = _imgHeight; ++ resourceDesc.res.pitch2D.pitchInBytes = _imgWidth * _numChannel * sizeof(float); ++ resourceDesc.res.pitch2D.desc = channelFmtDesc; ++ ++ cudaCreateTextureObject(&texObj.handle, &resourceDesc, &textureDesc, nullptr); ++ ProgramCU::CheckErrorCUDA("CuTexImage::BindTexture2D"); ++ ++ return texObj; ++} + + CuTexImage::CuTexImage() + { +@@ -171,69 +210,6 @@ void CuTexImage::CopyToHost(void * buf, int stream) + cudaMemcpyAsync(buf, _cuData, _imgWidth * _imgHeight * _numChannel * sizeof(float), cudaMemcpyDeviceToHost, (cudaStream_t)stream); + } + +-void CuTexImage::InitTexture2D() +-{ +-#if !defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +- if(_cuData2D && (_texWidth < _imgWidth || _texHeight < _imgHeight)) +- { +- cudaFreeArray(_cuData2D); +- _cuData2D = NULL; +- } +- +- if(_cuData2D == NULL) +- { +- _texWidth = max(_texWidth, _imgWidth); +- _texHeight = max(_texHeight, _imgHeight); +- cudaChannelFormatDesc desc; +- desc.f = cudaChannelFormatKindFloat; +- desc.x = sizeof(float) * 8; +- desc.y = _numChannel >=2 ? sizeof(float) * 8 : 0; +- desc.z = _numChannel >=3 ? sizeof(float) * 8 : 0; +- desc.w = _numChannel >=4 ? 
sizeof(float) * 8 : 0; +- const cudaError_t status = cudaMallocArray(&_cuData2D, &desc, _texWidth, _texHeight); +- +- if (status != cudaSuccess) { +- _cuData = NULL; +- _numBytes = 0; +- } +- +- ProgramCU::CheckErrorCUDA("CuTexImage::InitTexture2D"); +- } +-#endif +-} +- +-void CuTexImage::CopyToTexture2D() +-{ +-#if !defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +- InitTexture2D(); +- +- if(_cuData2D) +- { +- cudaMemcpy2DToArray(_cuData2D, 0, 0, _cuData, _imgWidth* _numChannel* sizeof(float) , +- _imgWidth * _numChannel*sizeof(float), _imgHeight, cudaMemcpyDeviceToDevice); +- ProgramCU::CheckErrorCUDA("cudaMemcpy2DToArray"); +- } +-#endif +- +-} +- +-int CuTexImage::DebugCopyToTexture2D() +-{ +- +-/* CuTexImage tex; +- float data1[2][3] = {{1, 2, 5}, {3, 4, 5}}, data2[2][5]; +- tex.InitTexture(3, 2, 1); +- cudaMemcpy(tex._cuData, data1[0], 6 * sizeof(float), cudaMemcpyHostToDevice); +- cudaMemcpy(data1, tex._cuData, 4 * sizeof(float) , cudaMemcpyDeviceToHost); +- tex._texWidth =5; tex._texHeight = 2; +- tex.CopyToTexture2D(); +- cudaMemcpyFromArray(data2[0], tex._cuData2D, 0, 0, 10 * sizeof(float), cudaMemcpyDeviceToHost);*/ +- +- return 1; +-} +- +- +- + void CuTexImage::CopyFromPBO(int width, int height, GLuint pbo) + { + void* pbuf =NULL; +diff --git a/lib/SiftGPU/CuTexImage.h b/lib/SiftGPU/CuTexImage.h +index 0d62f6d07..1303b24cc 100644 +--- a/lib/SiftGPU/CuTexImage.h ++++ b/lib/SiftGPU/CuTexImage.h +@@ -24,13 +24,9 @@ + #ifndef CU_TEX_IMAGE_H + #define CU_TEX_IMAGE_H + +-class GLTexImage; +-struct cudaArray; +-struct textureReference; +- +-//using texture2D from linear memory ++#include + +-#define SIFTGPU_ENABLE_LINEAR_TEX2D ++class GLTexImage; + + class CuTexImage + { +@@ -45,18 +41,23 @@ class CuTexImage + int _texHeight; + GLuint _fromPBO; + public: ++ struct CuTexObj ++ { ++ cudaTextureObject_t handle; ++ ~CuTexObj(); ++ }; ++ + virtual void SetImageSize(int width, int height); + virtual bool InitTexture(int width, int height, int nchannel = 1); +- void InitTexture2D(); +- inline void BindTexture(textureReference& texRef); +- inline void BindTexture2D(textureReference& texRef); +- void CopyToTexture2D(); ++ CuTexObj BindTexture(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc); ++ CuTexObj BindTexture2D(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc); + void CopyToHost(void* buf); + void CopyToHost(void* buf, int stream); + void CopyFromHost(const void* buf); + int CopyToPBO(GLuint pbo); + void CopyFromPBO(int width, int height, GLuint pbo); +- static int DebugCopyToTexture2D(); + public: + inline int GetImgWidth(){return _imgWidth;} + inline int GetImgHeight(){return _imgHeight;} +diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu +index 9d842770d..0b99ad066 100644 +--- a/lib/SiftGPU/ProgramCU.cu ++++ b/lib/SiftGPU/ProgramCU.cu +@@ -98,19 +98,33 @@ + + + __device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH]; +-texture texData; +-texture texDataB; +-texture texDataF2; +-texture texDataF4; +-texture texDataI4; +-texture texDataList; +- +-//template __device__ float Conv(float *data) { return Conv(data) + data[i]*d_kernel[i];} +-//template<> __device__ float Conv<0>(float *data) { return data[0] * d_kernel[0]; } + ++const static cudaTextureDesc texDataDesc = []() { ++ cudaTextureDesc textureDesc; ++ memset(&textureDesc, 0, sizeof(textureDesc)); ++ textureDesc.readMode = cudaReadModeElementType; ++ textureDesc.addressMode[0] = cudaAddressModeClamp; ++ textureDesc.addressMode[1] = cudaAddressModeClamp; ++ 
textureDesc.addressMode[2] = cudaAddressModeClamp; ++ textureDesc.filterMode = cudaFilterModePoint; ++ textureDesc.normalizedCoords = false; ++ return textureDesc; ++}(); ++ ++const static cudaTextureDesc texDataBDesc = []() { ++ cudaTextureDesc textureDesc; ++ memset(&textureDesc, 0, sizeof(textureDesc)); ++ textureDesc.readMode = cudaReadModeNormalizedFloat; ++ textureDesc.addressMode[0] = cudaAddressModeClamp; ++ textureDesc.addressMode[1] = cudaAddressModeClamp; ++ textureDesc.addressMode[2] = cudaAddressModeClamp; ++ textureDesc.filterMode = cudaFilterModePoint; ++ textureDesc.normalizedCoords = false; ++ return textureDesc; ++}(); + + ////////////////////////////////////////////////////////////// +-template __global__ void FilterH( float* d_result, int width) ++template __global__ void FilterH(cudaTextureObject_t texData, float* d_result, int width) + { + + const int HALF_WIDTH = FW >> 1; +@@ -130,7 +144,7 @@ template __global__ void FilterH( float* d_result, int width) + if(cache_index < CACHE_WIDTH) + { + int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index); +- data[cache_index] = tex1Dfetch(texData,fetch_index); ++ data[cache_index] = tex1Dfetch(texData,fetch_index); + src_index += FILTERH_TILE_WIDTH; + cache_index += FILTERH_TILE_WIDTH; + } +@@ -149,7 +163,7 @@ template __global__ void FilterH( float* d_result, int width) + + + //////////////////////////////////////////////////////////////////// +-template __global__ void FilterV(float* d_result, int width, int height) ++template __global__ void FilterV(cudaTextureObject_t texData, float* d_result, int width, int height) + { + const int HALF_WIDTH = FW >> 1; + const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1; +@@ -188,7 +202,7 @@ template __global__ void FilterV(float* d_result, int width, int heigh + if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT) + { + int fetch_index = data_index < col ? col : (data_index > data_index_max? 
data_index_max : data_index); +- data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index); ++ data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index); + data_index += IMUL(FILTERV_BLOCK_HEIGHT, width); + } + } +@@ -218,7 +232,7 @@ template __global__ void FilterV(float* d_result, int width, int heigh + } + + +-template __global__ void UpsampleKernel(float* d_result, int width) ++template __global__ void UpsampleKernel(cudaTextureObject_t texData, float* d_result, int width) + { + const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1); + const float INV_SCALE = 1.0f / (float(SCALE)); +@@ -232,11 +246,11 @@ template __global__ void UpsampleKernel(float* d_result, int widt + int helper = blockIdx.y & SCALE_MASK; + if (helper) + { +- float v11 = tex1Dfetch(texData, index); +- float v12 = tex1Dfetch(texData, index + 1); ++ float v11 = tex1Dfetch(texData, index); ++ float v12 = tex1Dfetch(texData, index + 1); + index += width; +- float v21 = tex1Dfetch(texData, index); +- float v22 = tex1Dfetch(texData, index + 1); ++ float v21 = tex1Dfetch(texData, index); ++ float v22 = tex1Dfetch(texData, index + 1); + float w1 = INV_SCALE * helper, w2 = 1.0 - w1; + float v1 = (v21 * w1 + w2 * v11); + float v2 = (v22 * w1 + w2 * v12); +@@ -250,8 +264,8 @@ template __global__ void UpsampleKernel(float* d_result, int widt + } + }else + { +- float v1 = tex1Dfetch(texData, index); +- float v2 = tex1Dfetch(texData, index + 1); ++ float v1 = tex1Dfetch(texData, index); ++ float v2 = tex1Dfetch(texData, index + 1); + d_result[dst_idx] = v1; + #pragma unroll + for(int i = 1; i < SCALE; ++i) +@@ -268,19 +282,19 @@ template __global__ void UpsampleKernel(float* d_result, int widt + void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale) + { + int width = src->GetImgWidth(), height = src->GetImgHeight(); +- src->BindTexture(texData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale); + dim3 block(FILTERH_TILE_WIDTH); + switch(log_scale) + { +- case 1 : UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width); break; +- case 2 : UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width); break; +- case 3 : UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width); break; ++ case 1 : UpsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break; ++ case 2 : UpsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break; ++ case 3 : UpsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break; + default: break; + } + } + +-template __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width) ++template __global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width) + { + const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; + if(dst_col >= dst_width) return; +@@ -289,11 +303,11 @@ template __global__ void DownsampleKernel(float* d_result, int sr + const int src_row = blockIdx.y << LOG_SCALE; + const int src_idx = IMUL(src_row, src_width) + src_col; + const int dst_idx = IMUL(dst_width, dst_row) + dst_col; +- d_result[dst_idx] = tex1Dfetch(texData, src_idx); ++ d_result[dst_idx] = tex1Dfetch(texData, src_idx); + + } + +-__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale) ++__global__ void 
DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width, const int log_scale) + { + const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; + if(dst_col >= dst_width) return; +@@ -302,7 +316,7 @@ __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, + const int src_row = blockIdx.y << log_scale; + const int src_idx = IMUL(src_row, src_width) + src_col; + const int dst_idx = IMUL(dst_width, dst_row) + dst_col; +- d_result[dst_idx] = tex1Dfetch(texData, src_idx); ++ d_result[dst_idx] = tex1Dfetch(texData, src_idx); + + } + +@@ -310,28 +324,28 @@ void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale) + { + int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ; + +- src->BindTexture(texData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight()); + dim3 block(FILTERH_TILE_WIDTH); + switch(log_scale) + { +- case 1 : DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; +- case 2 : DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; +- case 3 : DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; +- default: DownsampleKernel <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale); ++ case 1 : DownsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break; ++ case 2 : DownsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break; ++ case 3 : DownsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break; ++ default: DownsampleKernel <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width, log_scale); + } + } + +-__global__ void ChannelReduce_Kernel(float* d_result) ++__global__ void ChannelReduce_Kernel(cudaTextureObject_t texData, float* d_result) + { + int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; +- d_result[index] = tex1Dfetch(texData, index*4); ++ d_result[index] = tex1Dfetch(texData, index*4); + } + +-__global__ void ChannelReduce_Convert_Kernel(float* d_result) ++__global__ void ChannelReduce_Convert_Kernel(cudaTextureObject_t texDataF4, float* d_result) + { + int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; +- float4 rgba = tex1Dfetch(texDataF4, index); ++ float4 rgba = tex1Dfetch(texDataF4, index); + d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z; + } + +@@ -343,19 +357,19 @@ void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int conv + dim3 block(FILTERH_TILE_WIDTH); + if(convert_rgb) + { +- src->BindTexture(texDataF4); +- ChannelReduce_Convert_Kernel<<>>((float*)dst->_cuData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ ChannelReduce_Convert_Kernel<<>>(srcTex.handle, (float*)dst->_cuData); + }else + { +- src->BindTexture(texData); +- ChannelReduce_Kernel<<>>((float*)dst->_cuData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ ChannelReduce_Kernel<<>>(srcTex.handle, (float*)dst->_cuData); + } + } + +-__global__ void ConvertByteToFloat_Kernel(float* d_result) ++__global__ void ConvertByteToFloat_Kernel(cudaTextureObject_t texDataB, float* d_result) + { + int index = IMUL(blockIdx.x, 
FILTERH_TILE_WIDTH) + threadIdx.x; +- d_result[index] = tex1Dfetch(texDataB, index); ++ d_result[index] = tex1Dfetch(texDataB, index); + } + + void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst) +@@ -363,8 +377,8 @@ void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst) + int width = src->GetImgWidth(), height = dst->GetImgHeight() ; + dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH); + dim3 block(FILTERH_TILE_WIDTH); +- src->BindTexture(texDataB); +- ConvertByteToFloat_Kernel<<>>((float*)dst->_cuData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataBDesc, cudaCreateChannelDesc()); ++ ConvertByteToFloat_Kernel<<>>(srcTex.handle, (float*)dst->_cuData); + } + + void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width) +@@ -403,17 +417,17 @@ template void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, C + int width = src->GetImgWidth(), height = src->GetImgHeight(); + + //horizontal filtering +- src->BindTexture(texData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height); + dim3 blockh(FILTERH_TILE_WIDTH); +- FilterH<<>>((float*)buf->_cuData, width); ++ FilterH<<>>(srcTex.handle, (float*)buf->_cuData, width); + CheckErrorCUDA("FilterH"); + + ///vertical filtering +- buf->BindTexture(texData); ++ CuTexImage::CuTexObj bufTex = buf->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT); + dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT); +- FilterV<<>>((float*)dst->_cuData, width, height); ++ FilterV<<>>(bufTex.handle, (float*)dst->_cuData, width, height); + CheckErrorCUDA("FilterV"); + } + +@@ -450,24 +464,20 @@ void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, f + } + + +-texture texC; +-texture texP; +-texture texN; +- +-void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height) ++void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, float2* d_got, int width, int height) + { + int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y; + int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x; + if(col < width && row < height) + { + int index = IMUL(row, width) + col; +- float vp = tex1Dfetch(texP, index); +- float v = tex1Dfetch(texC, index); ++ float vp = tex1Dfetch(texP, index); ++ float v = tex1Dfetch(texC, index); + d_dog[index] = v - vp; +- float vxn = tex1Dfetch(texC, index + 1); +- float vxp = tex1Dfetch(texC, index - 1); +- float vyp = tex1Dfetch(texC, index - width); +- float vyn = tex1Dfetch(texC, index + width); ++ float vxn = tex1Dfetch(texC, index + 1); ++ float vxp = tex1Dfetch(texC, index - 1); ++ float vyp = tex1Dfetch(texC, index - width); ++ float vyn = tex1Dfetch(texC, index + width); + float dx = vxn - vxp, dy = vyn - vyp; + float grd = 0.5f * sqrt(dx * dx + dy * dy); + float rot = (grd == 0.0f? 
0.0f : atan2(dy, dx)); +@@ -475,15 +485,15 @@ void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int he + } + } + +-void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height) ++void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, int width, int height) + { + int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y; + int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x; + if(col < width && row < height) + { + int index = IMUL(row, width) + col; +- float vp = tex1Dfetch(texP, index); +- float v = tex1Dfetch(texC, index); ++ float vp = tex1Dfetch(texP, index); ++ float v = tex1Dfetch(texC, index); + d_dog[index] = v - vp; + } + } +@@ -493,19 +503,19 @@ void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got) + int width = gus->GetImgWidth(), height = gus->GetImgHeight(); + dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY); + dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY); +- gus->BindTexture(texC); +- (gus -1)->BindTexture(texP); ++ CuTexImage::CuTexObj texCObj = gus->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj texPObj = (gus-1)->BindTexture(texDataDesc, cudaCreateChannelDesc()); + if(got->_cuData) +- ComputeDOG_Kernel<<>>((float*) dog->_cuData, (float2*) got->_cuData, width, height); ++ ComputeDOG_Kernel<<>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, (float2*) got->_cuData, width, height); + else +- ComputeDOG_Kernel<<>>((float*) dog->_cuData, width, height); ++ ComputeDOG_Kernel<<>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, width, height); + } + + + #define READ_CMP_DOG_DATA(datai, tex, idx) \ +- datai[0] = tex1Dfetch(tex, idx - 1);\ +- datai[1] = tex1Dfetch(tex, idx);\ +- datai[2] = tex1Dfetch(tex, idx + 1);\ ++ datai[0] = tex1Dfetch(tex, idx - 1);\ ++ datai[1] = tex1Dfetch(tex, idx);\ ++ datai[2] = tex1Dfetch(tex, idx + 1);\ + if(v > nmax)\ + {\ + nmax = max(nmax, datai[0]);\ +@@ -521,7 +531,7 @@ void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got) + } + + +-void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax, ++void __global__ ComputeKEY_Kernel(cudaTextureObject_t texP, cudaTextureObject_t texC, cudaTextureObject_t texN, float4* d_key, int width, int colmax, int rowmax, + float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization) + { + float data[3][3], v; +@@ -546,11 +556,11 @@ void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowm + #endif + { + in_image = 1; +- data[1][1] = v = tex1Dfetch(texC, idx[1]); ++ data[1][1] = v = tex1Dfetch(texC, idx[1]); + if(fabs(v) <= dog_threshold0) goto key_finish; + +- data[1][0] = tex1Dfetch(texC, idx[1] - 1); +- data[1][2] = tex1Dfetch(texC, idx[1] + 1); ++ data[1][0] = tex1Dfetch(texC, idx[1] - 1); ++ data[1][2] = tex1Dfetch(texC, idx[1] + 1); + nmax = max(data[1][0], data[1][2]); + nmin = min(data[1][0], data[1][2]); + +@@ -651,18 +661,18 @@ void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float T + dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY); + #endif + dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY); +- dogp->BindTexture(texP); +- dog ->BindTexture(texC); +- dogn->BindTexture(texN); ++ CuTexImage::CuTexObj texPObj = dogp->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj texCObj = dog->BindTexture(texDataDesc, 
cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj texNObj = dogn->BindTexture(texDataDesc, cudaCreateChannelDesc()); + Tedge = (Tedge+1)*(Tedge+1)/Tedge; +- ComputeKEY_Kernel<<>>((float4*) key->_cuData, width, ++ ComputeKEY_Kernel<<>>(texPObj.handle, texCObj.handle, texNObj.handle, (float4*) key->_cuData, width, + width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization); + + } + + + +-void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height) ++void __global__ InitHist_Kernel(cudaTextureObject_t texDataF4, int4* hist, int ws, int wd, int height) + { + int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y; + int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +@@ -677,7 +687,7 @@ void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height) + #pragma unroll + for(int i = 0; i < 4 ; ++i, ++scol) + { +- float4 temp = tex1Dfetch(texDataF4, sidx +i); ++ float4 temp = tex1Dfetch(texDataF4, sidx +i); + v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0; + } + } +@@ -694,13 +704,13 @@ void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist) + int wd = hist->GetImgWidth(), hd = hist->GetImgHeight(); + dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd); + dim3 block(HIST_INIT_WIDTH, 1); +- key->BindTexture(texDataF4); +- InitHist_Kernel<<>>((int4*) hist->_cuData, ws, wd, hd); ++ CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ InitHist_Kernel<<>>(keyTex.handle, (int4*) hist->_cuData, ws, wd, hd); + } + + + +-void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height) ++void __global__ ReduceHist_Kernel(cudaTextureObject_t texDataI4, int4* d_hist, int ws, int wd, int height) + { + int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y; + int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +@@ -713,7 +723,7 @@ void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height) + #pragma unroll + for(int i = 0; i < 4 && scol < ws; ++i, ++scol) + { +- int4 temp = tex1Dfetch(texDataI4, sidx + i); ++ int4 temp = tex1Dfetch(texDataI4, sidx + i); + v[i] = temp.x + temp.y + temp.z + temp.w; + } + d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]); +@@ -726,21 +736,21 @@ void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2) + int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight(); + int temp = (int)floorf(logf(float(wd * 2/ 3)) / logf(2.0f)); + const int wi = min(7, max(temp , 0)); +- hist1->BindTexture(texDataI4); ++ CuTexImage::CuTexObj hist1Tex = hist1->BindTexture(texDataDesc, cudaCreateChannelDesc()); + + const int BW = 1 << wi, BH = 1 << (7 - wi); + dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH); + dim3 block(BW, BH); +- ReduceHist_Kernel<<>>((int4*)hist2->_cuData, ws, wd, hd); ++ ReduceHist_Kernel<<>>(hist1Tex.handle, (int4*)hist2->_cuData, ws, wd, hd); + } + + +-void __global__ ListGen_Kernel(int4* d_list, int list_len, int width) ++void __global__ ListGen_Kernel(cudaTextureObject_t texDataList, cudaTextureObject_t texDataI4, int4* d_list, int list_len, int width) + { + int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- int4 pos = tex1Dfetch(texDataList, idx1); ++ int4 pos = tex1Dfetch(texDataList, idx1); + int idx2 = IMUL(pos.y, width) + pos.x; +- int4 temp = tex1Dfetch(texDataI4, idx2); ++ int4 temp = tex1Dfetch(texDataI4, idx2); + int sum1 = temp.x + temp.y; + int sum2 = sum1 + temp.z; + pos.x <<= 2; +@@ -766,15 +776,18 @@ void __global__ ListGen_Kernel(int4* d_list, int list_len, int width) + void ProgramCU::GenerateList(CuTexImage* list, 
CuTexImage* hist) + { + int len = list->GetImgWidth(); +- list->BindTexture(texDataList); +- hist->BindTexture(texDataI4); ++ CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj histTex = hist->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM); + dim3 block(LISTGEN_BLOCK_DIM); +- ListGen_Kernel<<>>((int4*) list->_cuData, len, ++ ListGen_Kernel<<>>(listTex.handle, histTex.handle, (int4*) list->_cuData, len, + hist->GetImgWidth()); + } + +-void __global__ ComputeOrientation_Kernel(float4* d_list, ++void __global__ ComputeOrientation_Kernel(cudaTextureObject_t texDataF2, ++ cudaTextureObject_t texDataF4, ++ cudaTextureObject_t texDataList, ++ float4* d_list, + int list_len, + int width, int height, + float sigma, float sigma_step, +@@ -791,16 +804,16 @@ void __global__ ComputeOrientation_Kernel(float4* d_list, + float4 key; + if(existing_keypoint) + { +- key = tex1Dfetch(texDataF4, idx); ++ key = tex1Dfetch(texDataF4, idx); + }else + { +- int4 ikey = tex1Dfetch(texDataList, idx); ++ int4 ikey = tex1Dfetch(texDataList, idx); + key.x = ikey.x + 0.5f; + key.y = ikey.y + 0.5f; + key.z = sigma; + if(subpixel || keepsign) + { +- float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x); ++ float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x); + if(subpixel) + { + key.x += offset.y; +@@ -835,7 +848,7 @@ void __global__ ComputeOrientation_Kernel(float4* d_list, + float dy = y - key.y; + float sq_dist = dx * dx + dy * dy; + if(sq_dist >= dist_threshold) continue; +- float2 got = tex2D(texDataF2, x, y); ++ float2 got = tex2D(texDataF2, x, y); + float weight = got.x * exp(sq_dist * factor); + float fidx = floorf(got.y * ten_degree_per_radius); + int oidx = fidx; +@@ -943,21 +956,31 @@ void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage + int len = list->GetImgWidth(); + if(len <= 0) return; + int width = got->GetImgWidth(), height = got->GetImgHeight(); ++ CuTexImage::CuTexObj texObjF4; ++ CuTexImage::CuTexObj texObjList; + if(existing_keypoint) + { +- list->BindTexture(texDataF4); ++ texObjF4 = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); + }else + { +- list->BindTexture(texDataList); +- if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4); ++ texObjList = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ if(GlobalUtil::_SubpixelLocalization) ++ { ++ texObjF4 = key->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ } + } +- got->BindTexture2D(texDataF2); ++ ++ CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc()); + + const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 
16 : ORIENTATION_COMPUTE_PER_BLOCK; + dim3 grid((len + block_width -1) / block_width); + dim3 block(block_width); + +- ComputeOrientation_Kernel<<>>((float4*) list->_cuData, ++ ComputeOrientation_Kernel<<>>( ++ gotTex.handle, ++ texObjF4.handle, ++ texObjList.handle, ++ (float4*) list->_cuData, + len, width, height, sigma, sigma_step, + GlobalUtil::_OrientationGaussianFactor, + GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor, +@@ -967,14 +990,14 @@ void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage + ProgramCU::CheckErrorCUDA("ComputeOrientation"); + } + +-template void __global__ ComputeDescriptor_Kernel(float4* d_des, int num, ++template void __global__ ComputeDescriptor_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num, + int width, int height, float window_factor) + { + const float rpi = 4.0/ 3.14159265358979323846; + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + int fidx = idx >> 4; + if(fidx >= num) return; +- float4 key = tex1Dfetch(texDataF4, fidx); ++ float4 key = tex1Dfetch(texDataF4, fidx); + int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; + float spt = fabs(key.z * window_factor); + float s, c; __sincosf(key.w, &s, &c); +@@ -1007,7 +1030,7 @@ template void __global__ ComputeDescriptor_Kernel(float4 + float nyn = fabs(ny); + if(nxn < 1.0f && nyn < 1.0f) + { +- float2 cc = tex2D(texDataF2, x, y); ++ float2 cc = tex2D(texDataF2, x, y); + float dnx = nx + offsetpt.x; + float dny = ny + offsetpt.y; + float ww = exp(-0.125f * (dnx * dnx + dny * dny)); +@@ -1048,14 +1071,14 @@ template void __global__ ComputeDescriptor_Kernel(float4 + } + + +-template void __global__ ComputeDescriptorRECT_Kernel(float4* d_des, int num, ++template void __global__ ComputeDescriptorRECT_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num, + int width, int height, float window_factor) + { + const float rpi = 4.0/ 3.14159265358979323846; + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + int fidx = idx >> 4; + if(fidx >= num) return; +- float4 key = tex1Dfetch(texDataF4, fidx); ++ float4 key = tex1Dfetch(texDataF4, fidx); + int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; + //float aspect_ratio = key.w / key.z; + //float aspect_sq = aspect_ratio * aspect_ratio; +@@ -1080,7 +1103,7 @@ template void __global__ ComputeDescriptorRECT_Kernel(fl + float nyn = fabs(ny); + if(nxn < 1.0f && nyn < 1.0f) + { +- float2 cc = tex2D(texDataF2, x, y); ++ float2 cc = tex2D(texDataF2, x, y); + float wx = 1.0 - nxn; + float wy = 1.0 - nyn; + float weight = wx * wy * cc.x; +@@ -1117,7 +1140,7 @@ template void __global__ ComputeDescriptorRECT_Kernel(fl + d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]); + } + +-void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num) ++void __global__ NormalizeDescriptor_Kernel(cudaTextureObject_t texDataF4, float4* d_des, int num) + { + float4 temp[32]; + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +@@ -1127,7 +1150,7 @@ void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num) + #pragma unroll + for(int i = 0; i < 32; ++i) + { +- temp[i] = tex1Dfetch(texDataF4, sidx +i); ++ temp[i] = tex1Dfetch(texDataF4, sidx +i); + norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y + + temp[i].z * temp[i].z + temp[i].w * temp[i].w); + } +@@ -1161,8 +1184,8 @@ void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* + int height = got->GetImgHeight(); + + 
dtex->InitTexture(num * 128, 1, 1); +- got->BindTexture2D(texDataF2); +- list->BindTexture(texDataF4); ++ CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); + int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE; + dim3 grid((num * 16 + block_width -1) / block_width); + dim3 block(block_width); +@@ -1170,24 +1193,24 @@ void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* + if(rect) + { + if(GlobalUtil::_UseDynamicIndexing) +- ComputeDescriptorRECT_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptorRECT_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + else +- ComputeDescriptorRECT_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptorRECT_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + + }else + { + if(GlobalUtil::_UseDynamicIndexing) +- ComputeDescriptor_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptor_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + else +- ComputeDescriptor_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptor_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + } + if(GlobalUtil::_NormalizedSIFT) + { +- dtex->BindTexture(texDataF4); ++ CuTexImage::CuTexObj dtexTex = dtex->BindTexture(texDataDesc, cudaCreateChannelDesc()); + const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK; + dim3 grid((num + block_width -1) / block_width); + dim3 block(block_width); +- NormalizeDescriptor_Kernel<<>>((float4*) dtex->_cuData, num); ++ NormalizeDescriptor_Kernel<<>>(dtexTex.handle, (float4*) dtex->_cuData, num); + } + CheckErrorCUDA("ComputeDescriptor"); + } +@@ -1213,14 +1236,14 @@ int ProgramCU::CheckErrorCUDA(const char* location) + } + } + +-void __global__ ConvertDOG_Kernel(float* d_result, int width, int height) ++void __global__ ConvertDOG_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height) + { + int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y; + int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x; + if(col < width && row < height) + { + int index = row * width + col; +- float v = tex1Dfetch(texData, index); ++ float v = tex1Dfetch(texData, index); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? 
+ 0.5 : saturate(0.5+20.0*v); + } +@@ -1230,21 +1253,21 @@ void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out) + { + if(out->_cuData == NULL) return; + int width = dog->GetImgWidth(), height = dog ->GetImgHeight(); +- dog->BindTexture(texData); ++ CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); + dim3 block(BLOCK_DIM, BLOCK_DIM); +- ConvertDOG_Kernel<<>>((float*) out->_cuData, width, height); ++ ConvertDOG_Kernel<<>>(dogTex.handle, (float*) out->_cuData, width, height); + ProgramCU::CheckErrorCUDA("DisplayConvertDOG"); + } + +-void __global__ ConvertGRD_Kernel(float* d_result, int width, int height) ++void __global__ ConvertGRD_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height) + { + int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y; + int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x; + if(col < width && row < height) + { + int index = row * width + col; +- float v = tex1Dfetch(texData, index << 1); ++ float v = tex1Dfetch(texData, index << 1); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? + 0 : saturate(5 * v); + +@@ -1256,14 +1279,14 @@ void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out) + { + if(out->_cuData == NULL) return; + int width = got->GetImgWidth(), height = got ->GetImgHeight(); +- got->BindTexture(texData); ++ CuTexImage::CuTexObj gotTex = got->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); + dim3 block(BLOCK_DIM, BLOCK_DIM); +- ConvertGRD_Kernel<<>>((float*) out->_cuData, width, height); ++ ConvertGRD_Kernel<<>>(gotTex.handle, (float*) out->_cuData, width, height); + ProgramCU::CheckErrorCUDA("DisplayConvertGRD"); + } + +-void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height) ++void __global__ ConvertKEY_Kernel(cudaTextureObject_t texData, cudaTextureObject_t texDataF4, float4* d_result, int width, int height) + { + + int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y; +@@ -1271,10 +1294,10 @@ void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height) + if(col < width && row < height) + { + int index = row * width + col; +- float4 keyv = tex1Dfetch(texDataF4, index); ++ float4 keyv = tex1Dfetch(texDataF4, index); + int is_key = (keyv.x == 1.0f || keyv.x == -1.0f); + int inside = col > 0 && row > 0 && row < height -1 && col < width - 1; +- float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5; ++ float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5; + d_result[index] = is_key && inside ? + (keyv.x > 0? 
make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)): + make_float4(v, v, v, 1.0f) ; +@@ -1284,19 +1307,19 @@ void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* + { + if(out->_cuData == NULL) return; + int width = key->GetImgWidth(), height = key ->GetImgHeight(); +- dog->BindTexture(texData); +- key->BindTexture(texDataF4); ++ CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); + dim3 block(BLOCK_DIM, BLOCK_DIM); +- ConvertKEY_Kernel<<>>((float4*) out->_cuData, width, height); ++ ConvertKEY_Kernel<<>>(dogTex.handle, keyTex.handle, (float4*) out->_cuData, width, height); + } + + +-void __global__ DisplayKeyPoint_Kernel(float4 * d_result, int num) ++void __global__ DisplayKeyPoint_Kernel(cudaTextureObject_t texDataF4, float4 * d_result, int num) + { + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + if(idx >= num) return; +- float4 v = tex1Dfetch(texDataF4, idx); ++ float4 v = tex1Dfetch(texDataF4, idx); + d_result[idx] = make_float4(v.x, v.y, 0, 1.0f); + } + +@@ -1306,17 +1329,17 @@ void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out) + int block_width = 64; + dim3 grid((num + block_width -1) /block_width); + dim3 block(block_width); +- ftex->BindTexture(texDataF4); +- DisplayKeyPoint_Kernel<<>>((float4*) out->_cuData, num); ++ CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ DisplayKeyPoint_Kernel<<>>(ftexTex.handle, (float4*) out->_cuData, num); + ProgramCU::CheckErrorCUDA("DisplayKeyPoint"); + } + +-void __global__ DisplayKeyBox_Kernel(float4* d_result, int num) ++void __global__ DisplayKeyBox_Kernel(cudaTextureObject_t texDataF4, float4* d_result, int num) + { + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + if(idx >= num) return; + int kidx = idx / 10, vidx = idx - IMUL(kidx , 10); +- float4 v = tex1Dfetch(texDataF4, kidx); ++ float4 v = tex1Dfetch(texDataF4, kidx); + float sz = fabs(v.z * 3.0f); + /////////////////////// + float s, c; __sincosf(v.w, &s, &c); +@@ -1336,24 +1359,8 @@ void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out) + int block_width = 32; + dim3 grid((len * 10 + block_width -1) / block_width); + dim3 block(block_width); +- ftex->BindTexture(texDataF4); +- DisplayKeyBox_Kernel<<>>((float4*) out->_cuData, len * 10); +-} +-/////////////////////////////////////////////////////////////////// +-inline void CuTexImage:: BindTexture(textureReference& texRef) +-{ +- cudaBindTexture(NULL, &texRef, _cuData, &texRef.channelDesc, _numBytes); +-} +- +-inline void CuTexImage::BindTexture2D(textureReference& texRef) +-{ +-#if defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +- cudaBindTexture2D(0, &texRef, _cuData, &texRef.channelDesc, _imgWidth, _imgHeight, _imgWidth* _numChannel* sizeof(float)); +-#else +- cudaChannelFormatDesc desc; +- cudaGetChannelDesc(&desc, _cuData2D); +- cudaBindTextureToArray(&texRef, _cuData2D, &desc); +-#endif ++ CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ DisplayKeyBox_Kernel<<>>(ftexTex.handle, (float4*) out->_cuData, len * 10); + } + + int ProgramCU::CheckCudaDevice(int device) +@@ -1401,11 +1408,7 @@ int ProgramCU::CheckCudaDevice(int device) + #define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX) + #define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY) + +- +-texture texDes1; +-texture texDes2; +- 
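Editorial aside (not part of 1840.patch): the hunks above and below replace SiftGPU's global texture references (texData, texDataF4, texDes1, texDes2, ...) with cudaTextureObject_t handles that CuTexImage::BindTexture creates per call and the CuTexObj destructor releases, since texture references were removed in CUDA 12. A minimal sketch of that pattern for a linear float buffer; the function and parameter names (CreateLinearFloatTexture, d_data, numBytes) are illustrative only and do not appear in the patch:

    #include <cstring>
    #include <cuda_runtime.h>

    // Wrap an existing device buffer in a texture object (no copy is made).
    cudaTextureObject_t CreateLinearFloatTexture(float* d_data, size_t numBytes) {
      cudaResourceDesc resDesc;
      std::memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = d_data;
      resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
      resDesc.res.linear.sizeInBytes = numBytes;

      cudaTextureDesc texDesc;
      std::memset(&texDesc, 0, sizeof(texDesc));
      texDesc.readMode = cudaReadModeElementType;
      texDesc.addressMode[0] = cudaAddressModeClamp;
      texDesc.filterMode = cudaFilterModePoint;
      texDesc.normalizedCoords = 0;

      cudaTextureObject_t tex = 0;
      cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
      return tex;  // kernels read it with tex1Dfetch<float>(tex, i);
                   // release with cudaDestroyTextureObject(tex).
    }

In the patched code the kernels receive such a handle as an explicit cudaTextureObject_t argument instead of referring to a file-scope texture, which is why every kernel signature in these hunks gains one or more texture-object parameters.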
+-void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp) ++void __global__ MultiplyDescriptor_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2, int* d_result, int num1, int num2, int3* d_temp) + { + int idx01 = (blockIdx.y * MULT_BLOCK_DIMY), idx02 = (blockIdx.x * MULT_BLOCK_DIMX); + +@@ -1419,13 +1422,13 @@ void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int + //Load feature descriptors + /////////////////////////////////////////////////////////////// + #if MULT_BLOCK_DIMY == 16 +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; + #elif MULT_BLOCK_DIMY == 8 + if(threadIdx.x < 64) + { +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; + } +@@ -1446,7 +1449,7 @@ void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int + #pragma unroll + for(int i = 0; i < 8; ++i) + { +- uint4 v = tex1Dfetch(texDes2, read_idx2 + i); ++ uint4 v = tex1Dfetch(texDes2, read_idx2 + i); + unsigned char* p2 = (unsigned char*)(&v); + #pragma unroll + for(int k = 0; k < MULT_BLOCK_DIMY; ++k) +@@ -1501,20 +1504,23 @@ void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImag + dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY); + texDot->InitTexture( num2,num1); + if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32); +- des1->BindTexture(texDes1); +- des2->BindTexture(texDes2); ++ CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc()); + +- MultiplyDescriptor_Kernel<<>>((int*)texDot->_cuData, num1, num2, ++ MultiplyDescriptor_Kernel<<>>(des1Tex.handle, des2Tex.handle, (int*)texDot->_cuData, num1, num2, + (texCRT? 
(int3*)texCRT->_cuData : NULL)); + } + +-texture texLoc1; +-texture texLoc2; +-struct Matrix33{float mat[3][3];}; ++struct Matrix33 ++{ ++ float mat[3][3]; ++}; + + + +-void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp, ++void __global__ MultiplyDescriptorG_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2, ++ cudaTextureObject_t texLoc1, cudaTextureObject_t texLoc2, ++ int* d_result, int num1, int num2, int3* d_temp, + Matrix33 H, float hdistmax, Matrix33 F, float fdistmax) + { + int idx01 = (blockIdx.y * MULT_BLOCK_DIMY); +@@ -1529,7 +1535,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2; + int cache_idx1 = IMUL(row4, 17) + (col4 << 2); + #if MULT_BLOCK_DIMY == 16 +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; + data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; +@@ -1537,7 +1543,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + #elif MULT_BLOCK_DIMY == 8 + if(threadIdx.x < 64) + { +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; + data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; +@@ -1549,7 +1555,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + __syncthreads(); + if(threadIdx.x < MULT_BLOCK_DIMY * 2) + { +- loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x); ++ loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x); + } + __syncthreads(); + if(idx2 >= num2) return; +@@ -1558,7 +1564,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + //geometric verification + ///////////////////////////////////////////////////////////////////////////////////////////// + int good_count = 0; +- float2 loc2 = tex1Dfetch(texLoc2, idx2); ++ float2 loc2 = tex1Dfetch(texLoc2, idx2); + #pragma unroll + for(int i = 0; i < MULT_BLOCK_DIMY; ++i) + { +@@ -1608,7 +1614,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + #pragma unroll + for(int i = 0; i < 8; ++i) + { +- uint4 v = tex1Dfetch(texDes2, read_idx2 + i); ++ uint4 v = tex1Dfetch(texDes2, read_idx2 + i); + unsigned char* p2 = (unsigned char*)(&v); + #pragma unroll + for(int k = 0; k < MULT_BLOCK_DIMY; ++k) +@@ -1674,11 +1680,12 @@ void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2, + //intermediate results + texDot->InitTexture( num2,num1); + if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3); +- loc1->BindTexture(texLoc1); +- loc2->BindTexture(texLoc2); +- des1->BindTexture(texDes1); +- des2->BindTexture(texDes2); +- MultiplyDescriptorG_Kernel<<>>((int*)texDot->_cuData, num1, num2, ++ CuTexImage::CuTexObj loc1Tex = loc1->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj loc2Tex = loc2->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ MultiplyDescriptorG_Kernel<<>>(des1Tex.handle, des2Tex.handle, loc1Tex.handle, loc2Tex.handle, ++ (int*)texDot->_cuData, num1, num2, + (texCRT? 
(int3*)texCRT->_cuData : NULL), + MatH, hdistmax, MatF, fdistmax); + } +diff --git a/lib/SiftGPU/PyramidCU.cpp b/lib/SiftGPU/PyramidCU.cpp +index ea6711931..074b442da 100644 +--- a/lib/SiftGPU/PyramidCU.cpp ++++ b/lib/SiftGPU/PyramidCU.cpp +@@ -237,7 +237,6 @@ void PyramidCU::ResizePyramid(int w, int h) + if( j >= 1 && j < 1 + param._dog_level_num) + { + got->InitTexture(wa, h, 2); //2 * nlev - 6 +- got->InitTexture2D(); + } + if(j > 1 && j < nlev -1) key->InitTexture(wa, h, 4); // nlev -3 ; 4 * nlev - 12 + } +@@ -296,7 +295,6 @@ void PyramidCU::FitPyramid(int w, int h) + if( j >= 1 && j < 1 + param._dog_level_num) + { + got->InitTexture(wa, h, 2); //2 * nlev - 6 +- got->InitTexture2D(); + } + if(j > 1 && j < nlev -1) key->InitTexture(wa, h, 4); // nlev -3 ; 4 * nlev - 12 + } +@@ -1084,7 +1082,7 @@ void PyramidCU::CopyGradientTex() + //compute the gradient + for(int j = 0; j < param._dog_level_num ; j++, got++, idx++) + { +- if(_levelFeatureNum[idx] > 0) got->CopyToTexture2D(); ++ // if(_levelFeatureNum[idx] > 0) got->CopyToTexture2D(); + } + } + if(GlobalUtil::_timingS) diff --git a/recipe/1840.patch b/recipe/1840.patch new file mode 100644 index 0000000..6629898 --- /dev/null +++ b/recipe/1840.patch @@ -0,0 +1,14723 @@ +From c8b6656fe83810e5edc059cb0b0c75528b905dab Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Sun, 12 Mar 2023 15:43:14 +0100 +Subject: [PATCH 1/2] Remove PBA as bundle adjustment backend to support CUDA + 12+ + +--- + CMakeLists.txt | 5 +- + cmake/CMakeConfig.cmake.in | 1 - + doc/bibliography.rst | 4 - + doc/tutorial.rst | 6 +- + lib/CMakeLists.txt | 1 - + lib/PBA/CMakeLists.txt | 41 - + lib/PBA/ConfigBA.cpp | 589 --- + lib/PBA/ConfigBA.h | 226 -- + lib/PBA/CuTexImage.cpp | 136 - + lib/PBA/CuTexImage.h | 83 - + lib/PBA/DataInterface.h | 423 --- + lib/PBA/LICENSE | 674 ---- + lib/PBA/ProgramCU.cu | 3637 ------------------- + lib/PBA/ProgramCU.h | 127 - + lib/PBA/SparseBundleCPU.cpp | 4369 ----------------------- + lib/PBA/SparseBundleCPU.h | 286 -- + lib/PBA/SparseBundleCU.cpp | 1989 ----------- + lib/PBA/SparseBundleCU.h | 176 - + lib/PBA/pba.cpp | 134 - + lib/PBA/pba.h | 156 - + lib/PBA/util.h | 753 ---- + src/controllers/incremental_mapper.cc | 22 +- + src/controllers/incremental_mapper.h | 7 - + src/optim/bundle_adjustment.cc | 253 -- + src/optim/bundle_adjustment.h | 66 - + src/optim/bundle_adjustment_test.cc | 108 - + src/sfm/incremental_mapper.cc | 33 - + src/sfm/incremental_mapper.h | 5 +- + src/ui/license_widget.cc | 19 - + src/ui/license_widget.h | 1 - + src/ui/reconstruction_options_widget.cc | 3 - + src/util/option_manager.cc | 4 - + 32 files changed, 4 insertions(+), 14333 deletions(-) + delete mode 100644 lib/PBA/CMakeLists.txt + delete mode 100644 lib/PBA/ConfigBA.cpp + delete mode 100644 lib/PBA/ConfigBA.h + delete mode 100644 lib/PBA/CuTexImage.cpp + delete mode 100644 lib/PBA/CuTexImage.h + delete mode 100644 lib/PBA/DataInterface.h + delete mode 100755 lib/PBA/LICENSE + delete mode 100644 lib/PBA/ProgramCU.cu + delete mode 100644 lib/PBA/ProgramCU.h + delete mode 100644 lib/PBA/SparseBundleCPU.cpp + delete mode 100644 lib/PBA/SparseBundleCPU.h + delete mode 100644 lib/PBA/SparseBundleCU.cpp + delete mode 100644 lib/PBA/SparseBundleCU.h + delete mode 100644 lib/PBA/pba.cpp + delete mode 100644 lib/PBA/pba.h + delete mode 100644 lib/PBA/util.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 2a9724e15..e4d6e436b 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -264,7 +264,7 @@ if(CUDA_ENABLED AND CUDA_FOUND) + # 
Do not show warnings if the architectures are deprecated. + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") + # Do not show warnings if cuda library functions are deprecated. +- set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-declarations") ++ # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-declarations") + # Explicitly set PIC flags for CUDA targets. + if(NOT IS_MSVC) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fPIC") +@@ -404,7 +404,6 @@ endif() + + set(COLMAP_INTERNAL_LIBRARIES + lsd +- pba + poisson_recon + sift_gpu + # vlfeat +@@ -427,7 +426,6 @@ add_subdirectory(src) + ################################################################################ + + COLMAP_ADD_SOURCE_DIR(lib/LSD LIB_LSD_SRCS *.h *.c) +-COLMAP_ADD_SOURCE_DIR(lib/PBA LIB_PBA_SRCS *.h *.cpp *.cu) + COLMAP_ADD_SOURCE_DIR(lib/PoissonRecon LIB_POISSON_RECON_SRCS *.h *.cpp *.inl) + COLMAP_ADD_SOURCE_DIR(lib/SiftGPU LIB_SIFT_GPU_SRCS *.h *.cpp *.cu) + # COLMAP_ADD_SOURCE_DIR(lib/VLFeat LIB_VLFEAT_SRCS *.h *.c *.tc) +@@ -451,7 +449,6 @@ COLMAP_ADD_SOURCE_DIR(src/util UTIL_SRCS *.h *.cc) + add_library( + ${COLMAP_SRC_ROOT_FOLDER} + ${LIB_LSD_SRCS} +- ${LIB_PBA_SRCS} + ${LIB_POISSON_RECON_SRCS} + ${LIB_SIFT_GPU_SRCS} + # ${LIB_VLFEAT_SRCS} +diff --git a/cmake/CMakeConfig.cmake.in b/cmake/CMakeConfig.cmake.in +index d6133f027..755fcab32 100644 +--- a/cmake/CMakeConfig.cmake.in ++++ b/cmake/CMakeConfig.cmake.in +@@ -167,7 +167,6 @@ set(COLMAP_LINK_DIRS + + set(COLMAP_INTERNAL_LIBRARIES + lsd +- pba + poisson_recon + sqlite3 + sift_gpu +diff --git a/doc/bibliography.rst b/doc/bibliography.rst +index 4845bc83d..1922adb48 100755 +--- a/doc/bibliography.rst ++++ b/doc/bibliography.rst +@@ -40,9 +40,5 @@ Bibliography + .. [lowe04] Lowe, David G. "Distinctive image features from scale-invariant + keypoints". International journal of computer vision 60.2 (2004): 91-110. + +-.. [wu11] Wu, Changchang, Sameer Agarwal, Brian Curless, +- and Steven M. Seitz. "Multicore bundle adjustment." +- Conference on Computer Vision and Pattern Recognition, 2011. +- + .. [wu13] Wu, Changchang. "Towards linear-time incremental structure from + motion." International Conference 3D Vision, 2013. +diff --git a/doc/tutorial.rst b/doc/tutorial.rst +index cdf6701e1..fcd071523 100755 +--- a/doc/tutorial.rst ++++ b/doc/tutorial.rst +@@ -384,11 +384,7 @@ available controls. COLMAP attempts to reconstruct multiple models if not all + images are registered into the same model. The different models can be selected + from the drop-down menu in the toolbar. If the different models have common + registered images, you can use the ``model_converter`` executable to merge them +-into a single reconstruction (see :ref:`FAQ ` for details). If +-all your images use the `SIMPLE_RADIAL` camera model (default) without shared +-intrinsics, you can use PBA [wu11]_ instead of Ceres Solver [ceres]_ for fast +-bundle adjustment, which can be activated in the reconstruction options under +-the bundle adjustment section (`use_pba=true`). ++into a single reconstruction (see :ref:`FAQ ` for details). + + Ideally, the reconstruction works fine and all images are registered. 
If this is + not the case, it is recommended to: +diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt +index a6c26e7..a61e438 100644 +--- a/lib/CMakeLists.txt ++++ b/lib/CMakeLists.txt +@@ -36,7 +36,6 @@ elseif(IS_GNU OR IS_CLANG) + endif() + + add_subdirectory(LSD) +-add_subdirectory(PBA) + add_subdirectory(PoissonRecon) + add_subdirectory(SiftGPU) + # add_subdirectory(VLFeat) +diff --git a/lib/PBA/CMakeLists.txt b/lib/PBA/CMakeLists.txt +deleted file mode 100644 +index 2473436e6..000000000 +--- a/lib/PBA/CMakeLists.txt ++++ /dev/null +@@ -1,41 +0,0 @@ +-if(NOT IS_MSVC) +- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") +-endif() +- +-if(NOT SIMD_ENABLED) +- add_definitions("-DDISABLE_CPU_NEON") +- add_definitions("-DDISABLE_CPU_AVX") +- add_definitions("-DDISABLE_CPU_SSE") +-endif() +- +-if(CUDA_ENABLED) +- COLMAP_ADD_CUDA_LIBRARY(pba +- ConfigBA.cpp +- ConfigBA.h +- CuTexImage.cpp +- CuTexImage.h +- DataInterface.h +- pba.cpp +- pba.h +- ProgramCU.cu +- ProgramCU.h +- SparseBundleCPU.cpp +- SparseBundleCPU.h +- SparseBundleCU.cpp +- SparseBundleCU.h +- util.h +- ) +-else() +- add_definitions("-DPBA_NO_GPU") +- +- COLMAP_ADD_LIBRARY(pba +- ConfigBA.cpp +- ConfigBA.h +- DataInterface.h +- pba.cpp +- pba.h +- SparseBundleCPU.cpp +- SparseBundleCPU.h +- util.h +- ) +-endif() +diff --git a/lib/PBA/ConfigBA.cpp b/lib/PBA/ConfigBA.cpp +deleted file mode 100644 +index f59209477..000000000 +--- a/lib/PBA/ConfigBA.cpp ++++ /dev/null +@@ -1,589 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ConfigBA.cpp +-// Author: Changchang Wu +-// Description : implementation of the configuration object class +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-using std::cout; +-using std::ofstream; +-using std::string; +- +-#ifndef _WIN32 +-#include +-#endif +- +-#include "ConfigBA.h" +- +-#ifdef _MSC_VER +-#define strcpy strcpy_s +-#define sprintf sprintf_s +-#endif +- +-namespace pba { +- +-ConfigBA::ConfigBA() { +- __lm_max_iteration = 50; +- __lm_initial_damp = 1e-3f; +- __lm_minimum_damp = 1e-10f; +- __lm_maximum_damp = 1e+5f; +- __lm_delta_threshold = 1e-6f; +- __lm_gradient_threshold = 1e-10f; +- __lm_mse_threshold = 0.25f; +- __lm_use_diagonal_damp = true; +- __lm_check_gradient = false; +- __lm_damping_auto_switch = 0; +- __bundle_time_budget = 0; +- __bundle_mode_next = 0; +- __bundle_current_mode = 0; +- +- //////////////////////////// +- __cg_max_iteration = 100; +- __cg_min_iteration = 10; +- __cg_recalculate_freq = 0; +- __cg_norm_threshold = 0.1f; +- __cg_norm_guard = 1.0f; +- __pba_experimental = 0; +- __cg_schur_complement = 0; +- +- //////////////////////////// +- __fixed_intrinsics = false; +- __use_radial_distortion = 0; +- __reset_initial_distortion = false; +- +- ////////////////////////////// +- __verbose_level = 2; +- __verbose_cg_iteration = false; +- __verbose_function_time = false; +- __verbose_allocation = false; +- __verbose_sse = false; +- __save_gradient_norm = false; +- __stat_filename = NULL; +- __matlab_format_stat = true; +- +- ///////////////////////////// +- __jc_store_transpose = true; +- __jc_store_original = true; +- __no_jacobian_store = false; +- +- __focal_normalize = true; +- __depth_normalize = true; +- __depth_degeneracy_fix = true; +- __jacobian_normalize = true; +- __data_normalize_median = 0.5f; +- __depth_check_epsilon = 0.01f; +- +- //////////////////////////// +- __multiply_jx_usenoj = true; +- +- //////////////////////////// +- __accurate_gain_ratio = true; +- //////////////////////////// +- __cpu_data_precision = 0; +- __current_device = -1; +- __selected_device = -1; +- __memory_usage = 0; +- __current_iteration = 0; +- __num_cpu_thread_all = 0; +- +- /////////////////////// +- __debug_pba = false; +- __profile_pba = 0; +- __cpu_thread_profile = false; +- __warmup_device = false; +- +- /////////////////////// +- __driver_output = NULL; +- +- ////////////////////////// +- ResetBundleStatistics(); +-} +- +-void ConfigBA::ResetBundleStatistics() { +- __abort_flag = false; +- __num_lm_success = 0; +- __num_lm_iteration = 0; +- __num_cg_iteration = 0; +- __num_projection_eval = 0; +- __num_jacobian_eval = 0; +- __num_camera_modified = 0; +- __num_point_behind = 0; +- __initial_mse = 0; +- __final_mse = 0; +- __final_mse_x = 0; +- __focal_scaling = 1.0f; +- __depth_scaling = 1.0f; +- __pba_return_code = 0; +- __current_iteration = 0; +- __warmup_device = false; +- __bundle_current_mode = __bundle_mode_next; +- for (int i = 0; i < NUM_TIMER; ++i) __timer_record[i] = 0; +- __bundle_records.resize(0); +- if (__num_cpu_thread_all) { +- std::cout << "WARNING: set all thread number to " << __num_cpu_thread_all +- << '\n'; +- for (int i = 0; i < NUM_FUNC; ++i) +- __num_cpu_thread[i] = __num_cpu_thread_all; +- } +-} +- +-void ConfigBA::ResetTemporarySetting() { +- __reset_initial_distortion = false; +- __bundle_time_budget = 0; +- __bundle_mode_next = 0; +- __bundle_current_mode = 0; +- __stat_filename = NULL; +- if (__lm_damping_auto_switch > 0 && !__lm_use_diagonal_damp) +- __lm_use_diagonal_damp = true; +-} +- +-void 
ConfigBA::SaveBundleStatistics(int ncam, int npt, int nproj) { +- if (__profile_pba) return; +- if (__stat_filename && __bundle_records.size() > 0) { +- char filenamebuf[1024]; +- char* ret = strchr(__stat_filename, '\r'); +- if (ret) ret[0] = 0; +- char* dot = strrchr(__stat_filename, '.'); +- if (dot && strchr(dot, '/') == NULL && strchr(dot, '\\') == NULL) +- strcpy(filenamebuf, __stat_filename); // if filename has extension, use +- // it +- else +- sprintf(filenamebuf, "%s%s%s%s%s%s%s%s%s.%s", __stat_filename, +- __cpu_data_precision == 0 ? "_gpu" : "_cpu", +- __cpu_data_precision == sizeof(double) ? "d" : "", +- __cg_schur_complement ? "_schur" : "\0", +- __lm_use_diagonal_damp +- ? "\0" +- : (__lm_damping_auto_switch > 0 ? "_ad" : "_id"), +- __use_radial_distortion == -1 +- ? "_md" +- : (__use_radial_distortion ? "_pd" : "\0"), +- __jacobian_normalize ? "\0" : "_nojn", +- __focal_normalize || __depth_normalize ? "\0" : "_nodn", +- __depth_degeneracy_fix ? "\0" : "_nodf", +- __matlab_format_stat ? "m" : "log"); +- +- /////////////////////////////////////////////////////// +- ofstream out(filenamebuf); +- out << std::left; +- +- float overhead = +- (BundleTimerGet(TIMER_OVERALL) - BundleTimerGet(TIMER_OPTIMIZATION)); +- if (__matlab_format_stat) +- out << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n" +- << "ncam = " << ncam << "; npt = " << npt << "; nproj = " << nproj +- << ";\n" +- << "%% overhead = " << overhead << ";\n" +- << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n" +- << "%% " << std::setw(10) << __num_lm_iteration +- << "\t linear systems solved;\n" +- << "%% " << std::setw(10) << __num_cg_iteration +- << "\t conjugated gradient steps;\n" +- << "%% " << std::setw(10) << BundleTimerGet(TIMER_OVERALL) +- << "\t seconds used overall;\n" +- << "%% " << std::setw(10) << BundleTimerGet(TIMER_PREPROCESSING) +- << "\t seconds on pre-processing;\n" +- << "%% " << std::setw(10) +- << BundleTimerGet(TIMER_GPU_UPLOAD) + +- BundleTimerGet(TIMER_GPU_ALLOCATION) +- << "\t seconds on upload;\n" +- << "%% " << std::setw(10) << BundleTimerGet(TIMER_OPTIMIZATION) +- << "\t seconds on optimization;\n" +- << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n" +- << (__cpu_data_precision == 0 ? "gpustat" : "cpustat") +- << (__cpu_data_precision == sizeof(double) ? "_db" : "") +- << (__jacobian_normalize ? "" : "_nojn") +- << (__depth_degeneracy_fix ? "" : "_nodf") +- << (__cg_schur_complement ? "_schur" : "") << " = [\n"; +- +- for (size_t i = 0; i < __bundle_records.size(); ++i) +- out << std::setw((i % 7 > 2) ? ((i % 7 > 4 && !__save_gradient_norm && +- !__lm_check_gradient) +- ? 0 +- : 12) +- : 5) +- << (__bundle_records[i] + (i == 1 ? overhead : 0)) +- << (i % 7 == 6 ? 
'\n' : '\t'); +- +- if (__matlab_format_stat) out << "];\n\n"; +- +- if (__verbose_level) +- std::cout << "\n---------------------------------------\n" << filenamebuf; +- } +-} +- +-#define REPORT_FUNCTION_TIME(FID) \ +- std::setw(5) << (((int)(BundleTimerGet(FID) * 100 + 50)) * 0.01) << "(" \ +- << std::setw(2) \ +- << 0.1f * ((int)(1000 * BundleTimerGet(FID) / \ +- BundleTimerGet(TIMER_OPTIMIZATION))) \ +- << "%)" +- +-void ConfigBA::PrintBundleStatistics() { +- if (__profile_pba) return; +- +- if (__verbose_level) +- std::cout << "\n---------------------------------------\n" << std::setw(10) +- << __num_lm_success << "\t successful iterations;\n" +- << std::setw(10) << __num_lm_iteration +- << "\t linear systems solved;\n" << std::setw(10) +- << __num_cg_iteration << "\t conjugated gradient steps;\n" +- << std::setw(10) << BundleTimerGet(TIMER_OVERALL) +- << "\t seconds used overall;\n" << std::setw(10) +- << BundleTimerGet(TIMER_GPU_ALLOCATION) +- << "\t seconds on allocation;\n" << std::setw(10) +- << BundleTimerGet(TIMER_PREPROCESSING) +- << "\t seconds on pre-processing;\n" << std::setw(10) +- << BundleTimerGet(TIMER_GPU_UPLOAD) << "\t seconds on upload;\n" +- << std::setw(10) << BundleTimerGet(TIMER_OPTIMIZATION) +- << "\t seconds on optimization;\n"; +- if (__verbose_level && __cpu_data_precision) +- std::cout << REPORT_FUNCTION_TIME(TIMER_FUNCTION_JJ) +- << "\t seconds on jacobians;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_PJ) +- << "\t seconds on projections;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_JX) +- << "\t seconds on JX;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_JTE) +- << "\t seconds on JtE;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_BC) +- << "\t seconds to compute preconditioner;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_MP) +- << "\t seconds to apply preconditioner;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_UP) +- << "\t seconds to update parameters;\n"; +- if (__verbose_level) +- std::cout << "---------------------------------------\n" +- << "mse = " << __initial_mse << " -> " << __final_mse << "" +- << " (" << __final_mse_x +- << (__use_radial_distortion == -1 ? 
'D' : 'U') << ")\n" +- << "---------------------------------------\n"; +-} +- +-double ConfigBA::MyClock() { +-#ifdef _WIN32 +- return clock() / double(CLOCKS_PER_SEC); +-#else +- static int started = 0; +- static struct timeval tstart; +- if (started == 0) { +- gettimeofday(&tstart, NULL); +- started = 1; +- return 0; +- } else { +- struct timeval now; +- gettimeofday(&now, NULL); +- return ((now.tv_usec - tstart.tv_usec) / 1000000.0 + +- (now.tv_sec - tstart.tv_sec)); +- } +-#endif +-} +- +-void ConfigBA::BundleTimerStart(int timer) { +- __timer_record[timer] = MyClock(); +-} +- +-void ConfigBA::BundleTimerSwitch(int timer) { +- __timer_record[timer] = MyClock() - __timer_record[timer]; +-} +- +-void ConfigBA::BundleTimerSwap(int timer1, int timer2) { +- BundleTimerSwitch(timer1); +- BundleTimerSwitch(timer2); +-} +- +-float ConfigBA::BundleTimerGet(int timer) { +- return float(__timer_record[timer]); +-} +- +-float ConfigBA::BundleTimerGetNow(int timer) { +- return 0.01f * ((int)(100 * (MyClock() - __timer_record[timer]))); +-} +- +-bool ConfigBA::IsTimeBudgetAvailable() { +- if (__bundle_time_budget <= 0) return true; +- return BundleTimerGetNow(TIMER_OVERALL) < __bundle_time_budget; +-} +- +-void ConfigBA::SaveBundleRecord(int iter, float res, float damping, float gn, +- float gi) { +- __bundle_records.push_back(float(iter)); +- __bundle_records.push_back(BundleTimerGetNow()); +- __bundle_records.push_back(float(__num_cg_iteration)); +- __bundle_records.push_back(res); +- __bundle_records.push_back(damping); +- __bundle_records.push_back(gn); +- __bundle_records.push_back(gi); +-} +- +-void ConfigBA::ParseParam(int argc, char** argv) { +-#define CHAR1_TO_INT(x) ((x >= 'A' && x <= 'Z') ? x + 32 : x) +-#define CHAR2_TO_INT(str, i) \ +- (str[i] ? CHAR1_TO_INT(str[i]) + (CHAR1_TO_INT(str[i + 1]) << 8) : 0) +-#define CHAR3_TO_INT(str, i) \ +- (str[i] ? 
CHAR1_TO_INT(str[i]) + (CHAR2_TO_INT(str, i + 1) << 8) : 0) +-#define STRING_TO_INT(str) (CHAR1_TO_INT(str[0]) + (CHAR3_TO_INT(str, 1) << 8)) +- +-#ifdef _MSC_VER +-// charizing is microsoft only +-#define MAKEINT1(a) (#@ a) +-#define sscanf sscanf_s +-#else +-#define mychar0 '0' +-#define mychar1 '1' +-#define mychar2 '2' +-#define mychar3 '3' +-#define mychara 'a' +-#define mycharb 'b' +-#define mycharc 'c' +-#define mychard 'd' +-#define mychare 'e' +-#define mycharf 'f' +-#define mycharg 'g' +-#define mycharh 'h' +-#define mychari 'i' +-#define mycharj 'j' +-#define mychark 'k' +-#define mycharl 'l' +-#define mycharm 'm' +-#define mycharn 'n' +-#define mycharo 'o' +-#define mycharp 'p' +-#define mycharq 'q' +-#define mycharr 'r' +-#define mychars 's' +-#define mychart 't' +-#define mycharu 'u' +-#define mycharv 'v' +-#define mycharw 'w' +-#define mycharx 'x' +-#define mychary 'y' +-#define mycharz 'z' +-#define MAKEINT1(a) (mychar##a) +-#endif +-#define MAKEINT2(a, b) (MAKEINT1(a) + (MAKEINT1(b) << 8)) +-#define MAKEINT3(a, b, c) (MAKEINT1(a) + (MAKEINT2(b, c) << 8)) +-#define MAKEINT4(a, b, c, d) (MAKEINT1(a) + (MAKEINT3(b, c, d) << 8)) +- +- char *arg, *param, *opt; +- int opti, argi; +- float argf; +- for (int i = 0; i < argc; i++) { +- arg = argv[i]; +- if (arg == NULL || arg[0] != '-' || !arg[1]) continue; +- opt = arg + 1; +- opti = STRING_TO_INT(opt); +- param = argv[i + 1]; +- +- //////////////////////////////// +- switch (opti) { +- case MAKEINT3(l, m, i): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __lm_max_iteration = argi; +- break; +- case MAKEINT3(l, m, d): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf >= 0) +- __lm_delta_threshold = argf; +- break; +- case MAKEINT3(l, m, e): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf >= 0) +- __lm_mse_threshold = argf; +- break; +- case MAKEINT3(l, m, g): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_gradient_threshold = argf; +- break; +- case MAKEINT4(d, a, m, p): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_initial_damp = argf; +- break; +- case MAKEINT4(d, m, i, n): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_minimum_damp = argf; +- break; +- case MAKEINT4(d, m, a, x): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_maximum_damp = argf; +- break; +- case MAKEINT3(c, g, i): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __cg_max_iteration = argi; +- break; +- case MAKEINT4(c, g, i, m): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __cg_min_iteration = argi; +- break; +- case MAKEINT3(c, g, n): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __cg_norm_threshold = argf; +- break; +- case MAKEINT3(c, g, g): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __cg_norm_guard = argf; +- break; +- case MAKEINT4(c, g, r, f): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __cg_recalculate_freq = argi; +- break; +- case MAKEINT1(v): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __verbose_level = argi; +- break; +- case MAKEINT4(d, e, v, i): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __selected_device = argi; +- break; +- case MAKEINT4(b, u, d, g): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __bundle_time_budget = argi; +- break; +- case MAKEINT3(e, x, p): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __pba_experimental = 
argi; +- break; +- case MAKEINT4(t, n, u, m): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __num_cpu_thread_all = argi; +- break; +- case MAKEINT4(p, r, o, f): +- __profile_pba = (i + 1 < argc && sscanf(param, "%d", &argi)) +- ? std::max(10, argi) +- : 100; +- break; +- case MAKEINT4(t, p, r, o): +- __cpu_thread_profile = true; +- break; +- case MAKEINT4(c, a, l, i): +- __fixed_intrinsics = true; +- break; +- case MAKEINT4(s, c, h, u): +- case MAKEINT4(s, s, o, r): +- __cg_schur_complement = true; +- break; +- case MAKEINT2(m, d): +- case MAKEINT4(r, a, d, i): +- __use_radial_distortion = -1; +- break; +- case MAKEINT2(p, d): +- __use_radial_distortion = 1; +- break; +- case MAKEINT3(r, 0, 0): +- __reset_initial_distortion = true; +- break; +- case MAKEINT4(v, a, r, i): +- __fixed_intrinsics = false; +- break; +- case MAKEINT4(n, a, c, c): +- __accurate_gain_ratio = false; +- break; +- case MAKEINT4(v, c, g, i): +- __verbose_cg_iteration = true; +- break; +- case MAKEINT4(v, f, u, n): +- __verbose_function_time = true; +- break; +- case MAKEINT4(v, a, l, l): +- __verbose_allocation = true; +- break; +- case MAKEINT4(v, s, s, e): +- __verbose_sse = true; +- break; +- case MAKEINT4(s, v, g, n): +- __save_gradient_norm = true; +- break; +- case MAKEINT2(i, d): +- __lm_use_diagonal_damp = false; +- break; +- case MAKEINT3(d, a, s): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_damping_auto_switch = std::max(argf, 0.1f); +- else +- __lm_damping_auto_switch = 2.0f; +- break; +- case MAKEINT4(c, h, k, g): +- __lm_check_gradient = true; +- break; +- case MAKEINT4(n, o, j, n): +- __jacobian_normalize = false; +- break; +- case MAKEINT2(n, j): +- __no_jacobian_store = true; +- case MAKEINT3(n, j, c): +- __jc_store_transpose = false; +- __jc_store_original = false; +- break; +- case MAKEINT4(n, j, c, o): +- __jc_store_original = false; +- break; +- case MAKEINT4(n, j, c, t): +- __jc_store_transpose = false; +- break; +- case MAKEINT3(j, x, j): +- __multiply_jx_usenoj = false; +- break; +- case MAKEINT4(j, x, n, j): +- __multiply_jx_usenoj = true; +- break; +- case MAKEINT4(n, o, d, n): +- __depth_normalize = false; +- __focal_normalize = false; +- break; +- case MAKEINT4(n, o, d, f): +- __depth_degeneracy_fix = false; +- break; +- case MAKEINT4(n, o, r, m): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __data_normalize_median = argf; +- break; +- case MAKEINT3(d, c, e): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0 && +- argf <= 0.01) +- __depth_check_epsilon = argf; +- break; +- case MAKEINT4(d, e, b, u): +- __debug_pba = true; +- break; +- case MAKEINT4(e, v, a, l): +- __lm_max_iteration = 100; +- __warmup_device = true; +- case MAKEINT4(s, t, a, t): +- __stat_filename = (i + 1 < argc && param[0] != '-') ? param : NULL; +- break; +- case MAKEINT3(o, u, t): +- __driver_output = (i + 1 < argc && param[0] != '-') ? 
param : NULL; +- break; +- case MAKEINT4(w, a, r, m): +- __warmup_device = true; +- break; +- case MAKEINT4(m, o, t, i): +- __bundle_mode_next = 1; +- break; +- case MAKEINT4(s, t, r, u): +- __bundle_mode_next = 2; +- break; +- } +- } +-} +- +-} // namespace pba +diff --git a/lib/PBA/ConfigBA.h b/lib/PBA/ConfigBA.h +deleted file mode 100644 +index 74bd52439..000000000 +--- a/lib/PBA/ConfigBA.h ++++ /dev/null +@@ -1,226 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ConfigBA.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : configuration object class +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef CONFIG_BA_H +-#define CONFIG_BA_H +-#include +- +-namespace pba { +- +-class ConfigBA { +- protected: +- enum { +- TIMER_OVERALL = 0, +- TIMER_OPTIMIZATION, +- TIMER_GPU_ALLOCATION, +- TIMER_GPU_UPLOAD, +- TIMER_PREPROCESSING, +- TIMER_GPU_DOWNLOAD, +- TIMER_CG_ITERATION, +- TIMER_LM_ITERATION, +- TIMER_FUNCTION_JJ, +- TIMER_FUNCTION_PJ, +- TIMER_FUNCTION_DD, +- TIMER_FUNCTION_JX, +- TIMER_FUNCTION_JTE, +- TIMER_FUNCTION_BC, +- TIMER_FUNCTION_MP, +- TIMER_FUNCTION_UP, +- TIMER_PROFILE_STEP, +- NUM_TIMER, +- FUNC_JX = 0, +- FUNC_JX_, +- FUNC_JTEC_JCT, +- FUNC_JTEC_JCO, +- FUNC_JTEP, +- FUNC_JTE_, +- FUNC_JJ_JCO_JCT_JP, +- FUNC_JJ_JCO_JP, +- FUNC_JJ_JCT_JP, +- FUNC_JJ_JP, +- FUNC_PJ, +- FUNC_BCC_JCT, +- FUNC_BCC_JCO, +- FUNC_BCP, +- FUNC_MPC, +- FUNC_MPP, +- FUNC_VS, +- FUNC_VV, +- NUM_FUNC +- }; +- class TimerBA { +- ConfigBA* _config; +- int _timer; +- +- public: +- TimerBA(ConfigBA* config, int timer) { +- (_config = config)->BundleTimerStart(_timer = timer); +- } +- TimerBA(ConfigBA* config, int timer, bool) { +- (_config = config)->BundleTimerSwitch(_timer = timer); +- } +- ~TimerBA() { _config->BundleTimerSwitch(_timer); } +- }; +- friend class TimerBA; +- +- public: +- ////////////////////////////// +- int __lm_max_iteration; //(default 50) +- int __cg_max_iteration; //(default 100) +- int __cg_min_iteration; //(default 10) +- int __cg_recalculate_freq; //(default 0) +- bool __accurate_gain_ratio; //(default true) accurate gain ratio for +- //approximate solutions +- +- ////////////////////////////// +- float __lm_delta_threshold; //(default 1e-6)|dx|_2, I use absolute (not +- //relative) change +- float __lm_gradient_threshold; //(default 1e-10)|Jt * e|_inf +- float __lm_mse_threshold; //(default 0.25) quit if MSE is equal to or smaller +- //than this +- float __lm_initial_damp; //(default 0.001)initial damping factor +- float __lm_minimum_damp; //(default 1e-10)minimum damping factor +- float __lm_maximum_damp; +- float __cg_norm_threshold; //(default 0.1)terminate CG if norm ratio is less +- //than threshold +- float __cg_norm_guard; //(default 1.0)abort cg when norm increases to +- int __pba_experimental; +- bool __cg_schur_complement; 
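An editorial aside before the remainder of ConfigBA.h: the ParseParam implementation removed above matches option names by packing up to four lower-cased characters into a single int (the CHAR1_TO_INT/CHAR2_TO_INT/CHAR3_TO_INT and MAKEINT* macros), so that "-lmi", "-damp", "-v" and friends can be dispatched in one switch statement. The stand-alone sketch below re-creates that packing idea for illustration only; pack, lower1 and the two options handled are assumed names, not part of PBA or of this patch.

// --- editor's sketch: char-packed option dispatch (illustrative, not from the patch) ---
#include <cstdio>

// Lower-case one ASCII character, as CHAR1_TO_INT does in the removed code.
constexpr int lower1(char c) { return (c >= 'A' && c <= 'Z') ? c + 32 : c; }

// Pack the first (up to) four characters of a NUL-terminated option name.
constexpr int pack(const char* s, int i = 0) {
  return (i >= 4 || s[i] == '\0') ? 0 : lower1(s[i]) + (pack(s, i + 1) << 8);
}

int main(int argc, char** argv) {
  // Illustrative defaults; the real defaults live in ConfigBA's constructor.
  int lm_max_iteration = 50;
  int verbose_level = 2;
  for (int i = 1; i < argc; ++i) {
    const char* arg = argv[i];
    if (arg[0] != '-' || !arg[1]) continue;
    const char* param = (i + 1 < argc) ? argv[i + 1] : nullptr;
    switch (pack(arg + 1)) {
      case pack("lmi"):  // max LM iterations, like MAKEINT3(l, m, i) above
        if (param && std::sscanf(param, "%d", &lm_max_iteration) == 1) ++i;
        break;
      case pack("v"):    // verbosity, like MAKEINT1(v) above
        if (param && std::sscanf(param, "%d", &verbose_level) == 1) ++i;
        break;
      default:
        break;
    }
  }
  std::printf("lmi=%d v=%d\n", lm_max_iteration, verbose_level);
  return 0;
}
// --- end of editor's sketch ---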
+- +- ////////////////////////////// +- bool __lm_check_gradient; //(default false) check g_inf for convergence +- float __lm_damping_auto_switch; +- bool __lm_use_diagonal_damp; //(default true)use (Jt*J + lambda * diag(Jt*J)) +- //= Jt * e +- // or use (Jt*J + lambda * I) = Jt * e +- bool __fixed_intrinsics; //(default false) set true for calibrated camera +- //system +- int __use_radial_distortion; //(default 0, 1 for projection distortion, 2 for +- //measurement distortion) +- bool __reset_initial_distortion; //(default false) reset the initial +- //distortio to 0 +- +- //////////////////////////// +- int __verbose_level; //(default 2) how many messages to print out +- bool __abort_flag; //(default false)abort the bundle adjustment loop if set +- //true +- bool __verbose_cg_iteration; //(default false)print out details of Conjugate +- //Gradients +- bool __verbose_function_time; //(default false)print timing of some key +- //functions +- bool __save_gradient_norm; //(default false)save |Jt * e|_2 of each iteration +- bool __verbose_allocation; //(default false)whether print out allocation +- //details +- bool __verbose_sse; //(default false) show mse or sse +- +- /////////////////////////////////// +- bool __jc_store_transpose; //(default true) whether store transpose of JC +- bool __no_jacobian_store; //(default false) whether use memory saving mode +- bool __jc_store_original; //(default true) whether store original JC +- +- /////////////////////////////////// +- bool __jacobian_normalize; //(default true) scaling the jacobians according +- //to initial jacobians +- bool __focal_normalize; //(default true) data normalization +- bool __depth_normalize; //(default true) data normalization +- bool __depth_degeneracy_fix; +- float __data_normalize_median; +- float __depth_check_epsilon; +- ///////////////////////////// +- +- protected: +- bool __multiply_jx_usenoj; // for debug purpose +- protected: +- ///////////////////////////// +- int __selected_device; +- int __cpu_data_precision; +- int __bundle_time_budget; +- int __bundle_mode_next; +- int __bundle_current_mode; +- ////////////////////////////// +- float __initial_mse; +- float __final_mse; +- float __final_mse_x; +- float __focal_scaling; +- float __depth_scaling; +- int __current_device; +- int __current_iteration; +- int __num_cg_iteration; +- int __num_lm_success; +- int __num_lm_iteration; +- int __num_projection_eval; +- int __num_jacobian_eval; +- int __num_camera_modified; +- int __num_point_behind; +- int __pba_return_code; +- int __recent_cg_status; +- int __profile_pba; +- bool __cpu_thread_profile; +- bool __debug_pba; +- bool __warmup_device; +- size_t __memory_usage; +- ///////////////////////////////////// +- bool __matlab_format_stat; +- char* __stat_filename; +- const char* __driver_output; +- std::vector __bundle_records; +- double __timer_record[NUM_TIMER]; +- int __num_cpu_thread_all; +- int __num_cpu_thread[NUM_FUNC]; +- +- protected: +- ConfigBA(); +- /////////////////////////////// +- void ResetTemporarySetting(); +- void ResetBundleStatistics(); +- void PrintBundleStatistics(); +- void SaveBundleStatistics(int ncam, int npt, int nproj); +- /////////////////////////////////////// +- void BundleTimerStart(int timer); +- void BundleTimerSwitch(int timer); +- float BundleTimerGet(int timer); +- void BundleTimerSwap(int timer1, int timer2); +- float BundleTimerGetNow(int timer = TIMER_OPTIMIZATION); +- ///////////////////////////////// +- void SaveBundleRecord(int iter, float res, float damping, float gn, float 
gi); +- bool IsTimeBudgetAvailable(); +- double MyClock(); +- +- public: +- void ParseParam(int argc, char** argv); +- +- public: +- // the following are to be called after finishing BA +- const char* GetOutputParam() { return __driver_output; } +- float GetInitialMSE() { return __initial_mse; } +- float GetFinalMSE() { return __final_mse; } +- double GetBundleTiming(int timer = TIMER_OVERALL) { +- return __timer_record[timer]; +- } +- int GetIterationsLM() { return __num_lm_iteration; } +- int GetIterationsCG() { return __num_cg_iteration; } +- int GetCurrentDevice() { return __current_device; } +- int GetBundleReturnCode() { return __pba_return_code; } +- int GetActiveDevice() { return __selected_device; } +-}; +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/CuTexImage.cpp b/lib/PBA/CuTexImage.cpp +deleted file mode 100644 +index 400a0f3..0000000 +--- a/lib/PBA/CuTexImage.cpp ++++ /dev/null +@@ -1,137 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: CuTexImage.cpp +-// Author: Changchang Wu +-// Description : implementation of the CuTexImage class. +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-using namespace std; +- +-#include +-#include +-#include "CuTexImage.h" +- +-#if CUDA_VERSION <= 2010 +-#error "Require CUDA 2.2 or higher" +-#endif +- +-namespace pba { +- +-CuTexImage::CuTexImage() { +- _owner = true; +- _cuData = NULL; +- _numBytes = _numChannel = 0; +- _imgWidth = _imgHeight = 0; +-} +- +-CuTexImage::~CuTexImage() { +- if (_cuData && _owner) cudaFree(_cuData); +-} +- +-void CuTexImage::ReleaseData() { +- if (_cuData && _owner) cudaFree(_cuData); +- _cuData = NULL; +- _numBytes = 0; +-} +- +-void CuTexImage::SwapData(CuTexImage& src) { +- if (_cuData == src._cuData) return; +- +- void* cuData = _cuData; +- unsigned int numChannel = _numChannel; +- unsigned int imgWidth = _imgWidth; +- unsigned int imgHeight = _imgHeight; +- bool owner = _owner; +- size_t numBytes = _numBytes; +- +- _cuData = src._cuData; +- _numChannel = src._numChannel; +- _numBytes = src._numBytes; +- _imgWidth = src._imgWidth; +- _imgHeight = src._imgHeight; +- _owner = src._owner; +- +- src._cuData = cuData; +- src._numChannel = numChannel; +- src._numBytes = numBytes; +- src._imgWidth = imgWidth; +- src._imgHeight = imgHeight; +- src._owner = owner; +-} +- +-bool CuTexImage::InitTexture(unsigned int width, unsigned int height, +- unsigned int nchannel) { +- size_t size = sizeof(float) * width * height * nchannel; +- _imgWidth = width; +- _imgHeight = height; +- _numChannel = nchannel; +- +- if (size <= _numBytes) return true; +- +- if (_cuData && _owner) cudaFree(_cuData); +- +- // allocate the array data +- cudaError_t e = cudaMalloc(&_cuData, size); +- _numBytes = e == cudaSuccess ? 
size : 0; +- _owner = true; +- return e == cudaSuccess; +-} +- +-void CuTexImage::SetTexture(void* data, unsigned int width, +- unsigned int nchannel) { +- if (_cuData && _owner) cudaFree(_cuData); +- _imgWidth = width; +- _imgHeight = 1; +- _numChannel = nchannel; +- _numBytes = sizeof(float) * width * _imgHeight * _numChannel; +- _cuData = data; +- _owner = false; +-} +- +-void CuTexImage::CopyFromHost(const void* buf) { +- if (_cuData == NULL || buf == NULL || GetDataSize() == 0) return; +- cudaMemcpy(_cuData, buf, _imgWidth * _imgHeight * _numChannel * sizeof(float), +- cudaMemcpyHostToDevice); +-} +- +-void CuTexImage::CopyFromDevice(const void* buf) { +- if (_cuData == NULL) return; +- cudaMemcpy((char*)_cuData, buf, +- _imgWidth * _imgHeight * _numChannel * sizeof(float), +- cudaMemcpyDeviceToDevice); +-} +- +-void CuTexImage::CopyToHost(void* buf) { +- if (_cuData == NULL) return; +- size_t sz = _imgWidth * _imgHeight * _numChannel * sizeof(float); +- // cudaThreadSynchronize(); +- cudaMemcpy(buf, _cuData, sz, cudaMemcpyDeviceToHost); +- cudaThreadSynchronize(); +-} +- +-void CuTexImage::SaveToFile(const char* name) { +- ofstream out(name); +- vector value(GetLength()); +- CopyToHost(&value[0]); +- for (size_t i = 0; i < value.size(); ++i) out << value[i] << '\n'; +-} +- +-} // namespace pba +diff --git a/lib/PBA/CuTexImage.h b/lib/PBA/CuTexImage.h +deleted file mode 100644 +index e53e566e7..000000000 +--- a/lib/PBA/CuTexImage.h ++++ /dev/null +@@ -1,83 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: CuTexImage.h +-// Author: Changchang Wu +-// Description : interface for the CuTexImage class. +-// class for storing data in CUDA. +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
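Before the CuTexImage.h header continues, note the allocation strategy in the CuTexImage.cpp code removed above: InitTexture returns early when the requested size already fits within _numBytes and only frees and re-allocates when the buffer must grow, while SetTexture wraps externally owned memory without taking ownership. A minimal sketch of that grow-only device-buffer pattern follows; DeviceBuffer, Reserve and Upload are assumed names for illustration (not PBA API), and it needs the CUDA runtime to build.

// --- editor's sketch: grow-only CUDA device buffer (illustrative, not from the patch) ---
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>
#include <vector>

class DeviceBuffer {
 public:
  DeviceBuffer() = default;
  DeviceBuffer(const DeviceBuffer&) = delete;             // avoid double free
  DeviceBuffer& operator=(const DeviceBuffer&) = delete;
  ~DeviceBuffer() { if (data_) cudaFree(data_); }

  // Keep the current allocation whenever `bytes` still fits, as InitTexture does.
  bool Reserve(size_t bytes) {
    if (bytes <= capacity_) return true;
    if (data_) cudaFree(data_);
    data_ = nullptr;
    cudaError_t e = cudaMalloc(&data_, bytes);
    capacity_ = (e == cudaSuccess) ? bytes : 0;           // mirror the _numBytes handling
    return e == cudaSuccess;
  }

  bool Upload(const void* host, size_t bytes) {
    return Reserve(bytes) &&
           cudaMemcpy(data_, host, bytes, cudaMemcpyHostToDevice) == cudaSuccess;
  }

  void* data() const { return data_; }

 private:
  void* data_ = nullptr;
  size_t capacity_ = 0;
};

int main() {
  std::vector<float> host(1 << 20, 1.0f);
  DeviceBuffer buf;
  // The second upload reuses the first allocation because the size does not grow.
  bool ok = buf.Upload(host.data(), host.size() * sizeof(float)) &&
            buf.Upload(host.data(), host.size() * sizeof(float));
  std::printf("upload %s\n", ok ? "ok" : "failed");
  return 0;
}
// --- end of editor's sketch ---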
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef CU_TEX_IMAGE_H +-#define CU_TEX_IMAGE_H +- +-struct textureReference; +- +-namespace pba { +- +-class CuTexImage { +- protected: +- bool _owner; +- void* _cuData; +- unsigned int _numChannel; +- unsigned int _imgWidth; +- unsigned int _imgHeight; +- size_t _numBytes; +- +- public: +- bool InitTexture(unsigned int width, unsigned int height, +- unsigned int nchannel = 1); +- void SetTexture(void* data, unsigned int width, unsigned int nchannel = 1); +- void BindTexture(textureReference& texRef); +- void BindTexture(textureReference& texRef, int offset, size_t size); +- void BindTexture2(textureReference& texRef1, textureReference& texRef2); +- void BindTexture4(textureReference& texRef1, textureReference& texRef2, +- textureReference& texRef3, textureReference& texRef4); +- int BindTextureX(textureReference& texRef1, textureReference& texRef2, +- textureReference& texRef3, textureReference& texRef4, +- bool force4); +- void SwapData(CuTexImage& src); +- void CopyToHost(void* buf); +- void CopyFromDevice(const void* buf); +- void CopyFromHost(const void* buf); +- void SaveToFile(const char* name); +- void ReleaseData(); +- +- public: +- inline float* data() { return GetRequiredSize() ? ((float*)_cuData) : NULL; } +- inline bool IsValid() { return _cuData != NULL && GetDataSize() > 0; } +- inline unsigned int GetLength() { +- return _imgWidth * _imgHeight * _numChannel; +- } +- inline unsigned int GetImgWidth() { return _imgWidth; } +- inline unsigned int GetImgHeight() { return _imgHeight; } +- inline size_t GetReservedWidth() { +- return _numBytes == 0 +- ? 0 +- : (_numBytes / (_imgHeight * _numChannel * sizeof(float))); +- } +- inline size_t GetDataSize() { return _numBytes == 0 ? 0 : GetRequiredSize(); } +- inline size_t GetRequiredSize() { +- return sizeof(float) * _imgWidth * _imgHeight * _numChannel; +- } +- inline unsigned int IsHugeData() { return (GetLength() - 1) / (1 << 27); } +- +- public: +- CuTexImage(); +- virtual ~CuTexImage(); +-}; +- +-} // namespace pba +- +-#endif // !defined(CU_TEX_IMAGE_H) +diff --git a/lib/PBA/DataInterface.h b/lib/PBA/DataInterface.h +deleted file mode 100644 +index b465bd60a..000000000 +--- a/lib/PBA/DataInterface.h ++++ /dev/null +@@ -1,423 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: DataInterface.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : data interface, the data format been uploaded to GPU +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
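DataInterface.h, removed next, defines the plain transfer structs (CameraT_, Point3D_, Point2D) that PBA copies to the GPU; the deleted code below describes them as using a "4-float alignment", and Point3D_ carries an explicit reserved float so that each point occupies 16 bytes. A tiny self-contained illustration of that padding idea, with MyPoint as a hypothetical stand-in rather than the library's type:

// --- editor's sketch: 4-float (16-byte) point layout (illustrative, not from the patch) ---
#include <cstdio>

struct MyPoint {
  float xyz[3];    // 3D point location, as in Point3D_
  float reserved;  // explicit padding: 3 + 1 floats = 16 bytes per record
};

// 16-byte records keep point arrays aligned for float4-style GPU loads.
static_assert(sizeof(MyPoint) == 4 * sizeof(float), "expected a 4-float layout");

int main() {
  MyPoint p = {{1.0f, 2.0f, 3.0f}, 0.0f};
  std::printf("sizeof(MyPoint) = %zu bytes, x = %.1f\n", sizeof(MyPoint), p.xyz[0]);
  return 0;
}
// --- end of editor's sketch ---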
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef DATA_INTERFACE_GPU_H +-#define DATA_INTERFACE_GPU_H +- +-#include +- +-// ----------------------------WARNING------------------------------ +-// ----------------------------------------------------------------- +-// ROTATION CONVERSION: +-// The internal rotation representation is 3x3 float matrix. Reading +-// back the rotations as quaternion or Rodrigues's representation will +-// cause inaccuracy, IF you have wrongly reconstructed cameras with +-// a very very large focal length (typically also very far away). +-// In this case, any small change in the rotation matrix, will cause +-// a large reprojection error. +-// +-// --------------------------------------------------------------------- +-// RADIAL distortion is NOT enabled by default, use parameter "-md", -pd" +-// or set ConfigBA::__use_radial_distortion to 1 or -1 to enable it. +-// --------------------------------------------------------------------------- +- +-namespace pba { +- +-// transfer data type with 4-float alignment +-#define CameraT CameraT_ +-#define Point3D Point3D_ +-template +- +-struct CameraT_ { +- typedef FT float_t; +- ////////////////////////////////////////////////////// +- float_t f; // single focal length, K = [f, 0, 0; 0 f 0; 0 0 1] +- float_t t[3]; // T in P = K[R T], T = - RC +- float_t m[3][3]; // R in P = K[R T]. +- float_t radial; // WARNING: BE careful with the radial distortion model. +- int distortion_type; +- float_t constant_camera; +- +- ////////////////////////////////////////////////////////// +- CameraT_() { +- radial = 0; +- distortion_type = 0; +- constant_camera = 0; +- } +- +- ////////////////////////////////////////////// +- template +- void SetCameraT(const CameraX& cam) { +- f = (float_t)cam.f; +- t[0] = (float_t)cam.t[0]; +- t[1] = (float_t)cam.t[1]; +- t[2] = (float_t)cam.t[2]; +- for (int i = 0; i < 3; ++i) +- for (int j = 0; j < 3; ++j) m[i][j] = (float_t)cam.m[i][j]; +- radial = (float_t)cam.radial; +- distortion_type = cam.distortion_type; +- constant_camera = cam.constant_camera; +- } +- +- ////////////////////////////////////////// +- void SetConstantCamera() { constant_camera = 1.0f; } +- void SetVariableCamera() { constant_camera = 0.0f; } +- void SetFixedIntrinsic() { constant_camera = 2.0f; } +- // void SetFixedExtrinsic() {constant_camera = 3.0f;} +- +- ////////////////////////////////////// +- template +- void SetFocalLength(Float F) { +- f = (float_t)F; +- } +- float_t GetFocalLength() const { return f; } +- +- template +- void SetMeasurementDistortion(Float r) { +- radial = (float_t)r; +- distortion_type = -1; +- } +- float_t GetMeasurementDistortion() const { +- return distortion_type == -1 ? radial : 0; +- } +- +- // normalize radial distortion that applies to angle will be (radial * f * f); +- template +- void SetNormalizedMeasurementDistortion(Float r) { +- SetMeasurementDistortion(r / (f * f)); +- } +- float_t GetNormalizedMeasurementDistortion() const { +- return GetMeasurementDistortion() * (f * f); +- } +- +- // use projection distortion +- template +- void SetProjectionDistortion(Float r) { +- radial = float_t(r); +- distortion_type = 1; +- } +- template +- void SetProjectionDistortion(const Float* r) { +- SetProjectionDistortion(r[0]); +- } +- float_t GetProjectionDistortion() const { +- return distortion_type == 1 ? 
radial : 0; +- } +- +- template +- void SetRodriguesRotation(const Float r[3]) { +- double a = sqrt(r[0] * r[0] + r[1] * r[1] + r[2] * r[2]); +- double ct = a == 0.0 ? 0.5 : (1.0 - cos(a)) / a / a; +- double st = a == 0.0 ? 1 : sin(a) / a; +- m[0][0] = float_t(1.0 - (r[1] * r[1] + r[2] * r[2]) * ct); +- m[0][1] = float_t(r[0] * r[1] * ct - r[2] * st); +- m[0][2] = float_t(r[2] * r[0] * ct + r[1] * st); +- m[1][0] = float_t(r[0] * r[1] * ct + r[2] * st); +- m[1][1] = float_t(1.0 - (r[2] * r[2] + r[0] * r[0]) * ct); +- m[1][2] = float_t(r[1] * r[2] * ct - r[0] * st); +- m[2][0] = float_t(r[2] * r[0] * ct - r[1] * st); +- m[2][1] = float_t(r[1] * r[2] * ct + r[0] * st); +- m[2][2] = float_t(1.0 - (r[0] * r[0] + r[1] * r[1]) * ct); +- } +- template +- void GetRodriguesRotation(Float r[3]) const { +- double a = (m[0][0] + m[1][1] + m[2][2] - 1.0) / 2.0; +- const double epsilon = 0.01; +- if (fabs(m[0][1] - m[1][0]) < epsilon && +- fabs(m[1][2] - m[2][1]) < epsilon && +- fabs(m[0][2] - m[2][0]) < epsilon) { +- if (fabs(m[0][1] + m[1][0]) < 0.1 && fabs(m[1][2] + m[2][1]) < 0.1 && +- fabs(m[0][2] + m[2][0]) < 0.1 && a > 0.9) { +- r[0] = 0; +- r[1] = 0; +- r[2] = 0; +- } else { +- const Float ha = Float(sqrt(0.5) * 3.14159265358979323846); +- double xx = (m[0][0] + 1.0) / 2.0; +- double yy = (m[1][1] + 1.0) / 2.0; +- double zz = (m[2][2] + 1.0) / 2.0; +- double xy = (m[0][1] + m[1][0]) / 4.0; +- double xz = (m[0][2] + m[2][0]) / 4.0; +- double yz = (m[1][2] + m[2][1]) / 4.0; +- +- if ((xx > yy) && (xx > zz)) { +- if (xx < epsilon) { +- r[0] = 0; +- r[1] = r[2] = ha; +- } else { +- double t = sqrt(xx); +- r[0] = Float(t * 3.14159265358979323846); +- r[1] = Float(xy / t * 3.14159265358979323846); +- r[2] = Float(xz / t * 3.14159265358979323846); +- } +- } else if (yy > zz) { +- if (yy < epsilon) { +- r[0] = r[2] = ha; +- r[1] = 0; +- } else { +- double t = sqrt(yy); +- r[0] = Float(xy / t * 3.14159265358979323846); +- r[1] = Float(t * 3.14159265358979323846); +- r[2] = Float(yz / t * 3.14159265358979323846); +- } +- } else { +- if (zz < epsilon) { +- r[0] = r[1] = ha; +- r[2] = 0; +- } else { +- double t = sqrt(zz); +- r[0] = Float(xz / t * 3.14159265358979323846); +- r[1] = Float(yz / t * 3.14159265358979323846); +- r[2] = Float(t * 3.14159265358979323846); +- } +- } +- } +- } else { +- a = acos(a); +- double b = 0.5 * a / sin(a); +- r[0] = Float(b * (m[2][1] - m[1][2])); +- r[1] = Float(b * (m[0][2] - m[2][0])); +- r[2] = Float(b * (m[1][0] - m[0][1])); +- } +- } +- //////////////////////// +- template +- void SetQuaternionRotation(const Float q[4]) { +- double qq = sqrt(q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]); +- double qw, qx, qy, qz; +- if (qq > 0) { +- qw = q[0] / qq; +- qx = q[1] / qq; +- qy = q[2] / qq; +- qz = q[3] / qq; +- } else { +- qw = 1; +- qx = qy = qz = 0; +- } +- m[0][0] = float_t(qw * qw + qx * qx - qz * qz - qy * qy); +- m[0][1] = float_t(2 * qx * qy - 2 * qz * qw); +- m[0][2] = float_t(2 * qy * qw + 2 * qz * qx); +- m[1][0] = float_t(2 * qx * qy + 2 * qw * qz); +- m[1][1] = float_t(qy * qy + qw * qw - qz * qz - qx * qx); +- m[1][2] = float_t(2 * qz * qy - 2 * qx * qw); +- m[2][0] = float_t(2 * qx * qz - 2 * qy * qw); +- m[2][1] = float_t(2 * qy * qz + 2 * qw * qx); +- m[2][2] = float_t(qz * qz + qw * qw - qy * qy - qx * qx); +- } +- template +- void GetQuaternionRotation(Float q[4]) const { +- q[0] = 1 + m[0][0] + m[1][1] + m[2][2]; +- if (q[0] > 0.000000001) { +- q[0] = sqrt(q[0]) / 2.0; +- q[1] = (m[2][1] - m[1][2]) / (4.0 * q[0]); +- q[2] = (m[0][2] - m[2][0]) / 
(4.0 * q[0]); +- q[3] = (m[1][0] - m[0][1]) / (4.0 * q[0]); +- } else { +- double s; +- if (m[0][0] > m[1][1] && m[0][0] > m[2][2]) { +- s = 2.0 * sqrt(1.0 + m[0][0] - m[1][1] - m[2][2]); +- q[1] = 0.25 * s; +- q[2] = (m[0][1] + m[1][0]) / s; +- q[3] = (m[0][2] + m[2][0]) / s; +- q[0] = (m[1][2] - m[2][1]) / s; +- } else if (m[1][1] > m[2][2]) { +- s = 2.0 * sqrt(1.0 + m[1][1] - m[0][0] - m[2][2]); +- q[1] = (m[0][1] + m[1][0]) / s; +- q[2] = 0.25 * s; +- q[3] = (m[1][2] + m[2][1]) / s; +- q[0] = (m[0][2] - m[2][0]) / s; +- } else { +- s = 2.0 * sqrt(1.0 + m[2][2] - m[0][0] - m[1][1]); +- q[1] = (m[0][2] + m[2][0]) / s; +- q[2] = (m[1][2] + m[2][1]) / s; +- q[3] = 0.25f * s; +- q[0] = (m[0][1] - m[1][0]) / s; +- } +- } +- } +- //////////////////////////////////////////////// +- template +- void SetMatrixRotation(const Float* r) { +- int k = 0; +- for (int i = 0; i < 3; ++i) { +- for (int j = 0; j < 3; ++j) { +- m[i][j] = float_t(r[k++]); +- } +- } +- } +- template +- void GetMatrixRotation(Float* r) const { +- int k = 0; +- for (int i = 0; i < 3; ++i) { +- for (int j = 0; j < 3; ++j) { +- r[k++] = Float(m[i][j]); +- } +- } +- } +- float GetRotationMatrixDeterminant() const { +- return m[0][0] * m[1][1] * m[2][2] + m[0][1] * m[1][2] * m[2][0] + +- m[0][2] * m[1][0] * m[2][1] - m[0][2] * m[1][1] * m[2][0] - +- m[0][1] * m[1][0] * m[2][2] - m[0][0] * m[1][2] * m[2][1]; +- } +- /////////////////////////////////////// +- template +- void SetTranslation(const Float T[3]) { +- t[0] = (float_t)T[0]; +- t[1] = (float_t)T[1]; +- t[2] = (float_t)T[2]; +- } +- template +- void GetTranslation(Float T[3]) const { +- T[0] = (Float)t[0]; +- T[1] = (Float)t[1]; +- T[2] = (Float)t[2]; +- } +- ///////////////////////////////////////////// +- template +- void SetCameraCenterAfterRotation(const Float c[3]) { +- // t = - R * C +- for (int j = 0; j < 3; ++j) +- t[j] = -float_t(m[j][0] * c[0] + m[j][1] * c[1] + m[j][2] * c[2]); +- } +- template +- void GetCameraCenter(Float c[3]) { +- // C = - R' * t +- for (int j = 0; j < 3; ++j) +- c[j] = -float_t(m[0][j] * t[0] + m[1][j] * t[1] + m[2][j] * t[2]); +- } +- //////////////////////////////////////////// +- template +- void SetInvertedRT(const Float e[3], const Float T[3]) { +- SetRodriguesRotation(e); +- for (int i = 3; i < 9; ++i) m[0][i] = -m[0][i]; +- SetTranslation(T); +- t[1] = -t[1]; +- t[2] = -t[2]; +- } +- +- template +- void GetInvertedRT(Float e[3], Float T[3]) const { +- CameraT ci; +- ci.SetMatrixRotation(m[0]); +- for (int i = 3; i < 9; ++i) ci.m[0][i] = -ci.m[0][i]; +- // for(int i = 1; i < 3; ++i) for(int j = 0; j < 3; ++j) ci.m[i][j] = - +- // ci.m[i][j]; +- ci.GetRodriguesRotation(e); +- GetTranslation(T); +- T[1] = -T[1]; +- T[2] = -T[2]; +- } +- template +- void SetInvertedR9T(const Float e[9], const Float T[3]) { +- // for(int i = 0; i < 9; ++i) m[0][i] = (i < 3 ? 
e[i] : - e[i]); +- // SetTranslation(T); t[1] = - t[1]; t[2] = -t[2]; +- m[0][0] = e[0]; +- m[0][1] = e[1]; +- m[0][2] = e[2]; +- m[1][0] = -e[3]; +- m[1][1] = -e[4]; +- m[1][2] = -e[5]; +- m[2][0] = -e[6]; +- m[2][1] = -e[7]; +- m[2][2] = -e[8]; +- t[0] = T[0]; +- t[1] = -T[1]; +- t[2] = -T[2]; +- } +- template +- void GetInvertedR9T(Float e[9], Float T[3]) const { +- e[0] = m[0][0]; +- e[1] = m[0][1]; +- e[2] = m[0][2]; +- e[3] = -m[1][0]; +- e[4] = -m[1][1]; +- e[5] = -m[1][2]; +- e[6] = -m[2][0]; +- e[7] = -m[2][1]; +- e[8] = -m[2][2]; +- T[0] = t[0]; +- T[1] = -t[1]; +- T[2] = -t[2]; +- } +-}; +- +-template +-struct Point3D { +- typedef FT float_t; +- float_t xyz[3]; // 3D point location +- float_t reserved; // alignment +- //////////////////////////////// +- template +- void SetPoint(Float x, Float y, Float z) { +- xyz[0] = (float_t)x; +- xyz[1] = (float_t)y; +- xyz[2] = (float_t)z; +- reserved = 0; +- } +- template +- void SetPoint(const Float* p) { +- xyz[0] = (float_t)p[0]; +- xyz[1] = (float_t)p[1]; +- xyz[2] = (float_t)p[2]; +- reserved = 0; +- } +- template +- void GetPoint(Float* p) const { +- p[0] = (Float)xyz[0]; +- p[1] = (Float)xyz[1]; +- p[2] = (Float)xyz[2]; +- } +- template +- void GetPoint(Float& x, Float& y, Float& z) const { +- x = (Float)xyz[0]; +- y = (Float)xyz[1]; +- z = (Float)xyz[2]; +- } +-}; +- +-#undef CameraT +-#undef Point3D +- +-typedef CameraT_ CameraT; +-typedef Point3D_ Point3D; +- +-struct Point2D { +- float x, y; +- //////////////////////////////////////////////////////// +- Point2D() {} +- template +- Point2D(Float X, Float Y) { +- SetPoint2D(X, Y); +- } +- template +- void SetPoint2D(Float X, Float Y) { +- x = (float)X; +- y = (float)Y; +- } +- template +- void GetPoint2D(Float& X, Float& Y) const { +- X = (Float)x; +- Y = (Float)y; +- } +-}; +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/LICENSE b/lib/PBA/LICENSE +deleted file mode 100755 +index 94a9ed024..000000000 +--- a/lib/PBA/LICENSE ++++ /dev/null +@@ -1,674 +0,0 @@ +- GNU GENERAL PUBLIC LICENSE +- Version 3, 29 June 2007 +- +- Copyright (C) 2007 Free Software Foundation, Inc. +- Everyone is permitted to copy and distribute verbatim copies +- of this license document, but changing it is not allowed. +- +- Preamble +- +- The GNU General Public License is a free, copyleft license for +-software and other kinds of works. +- +- The licenses for most software and other practical works are designed +-to take away your freedom to share and change the works. By contrast, +-the GNU General Public License is intended to guarantee your freedom to +-share and change all versions of a program--to make sure it remains free +-software for all its users. We, the Free Software Foundation, use the +-GNU General Public License for most of our software; it applies also to +-any other work released this way by its authors. You can apply it to +-your programs, too. +- +- When we speak of free software, we are referring to freedom, not +-price. Our General Public Licenses are designed to make sure that you +-have the freedom to distribute copies of free software (and charge for +-them if you wish), that you receive source code or can get it if you +-want it, that you can change the software or use pieces of it in new +-free programs, and that you know you can do these things. +- +- To protect your rights, we need to prevent others from denying you +-these rights or asking you to surrender the rights. 
Therefore, you have +-certain responsibilities if you distribute copies of the software, or if +-you modify it: responsibilities to respect the freedom of others. +- +- For example, if you distribute copies of such a program, whether +-gratis or for a fee, you must pass on to the recipients the same +-freedoms that you received. You must make sure that they, too, receive +-or can get the source code. And you must show them these terms so they +-know their rights. +- +- Developers that use the GNU GPL protect your rights with two steps: +-(1) assert copyright on the software, and (2) offer you this License +-giving you legal permission to copy, distribute and/or modify it. +- +- For the developers' and authors' protection, the GPL clearly explains +-that there is no warranty for this free software. For both users' and +-authors' sake, the GPL requires that modified versions be marked as +-changed, so that their problems will not be attributed erroneously to +-authors of previous versions. +- +- Some devices are designed to deny users access to install or run +-modified versions of the software inside them, although the manufacturer +-can do so. This is fundamentally incompatible with the aim of +-protecting users' freedom to change the software. The systematic +-pattern of such abuse occurs in the area of products for individuals to +-use, which is precisely where it is most unacceptable. Therefore, we +-have designed this version of the GPL to prohibit the practice for those +-products. If such problems arise substantially in other domains, we +-stand ready to extend this provision to those domains in future versions +-of the GPL, as needed to protect the freedom of users. +- +- Finally, every program is threatened constantly by software patents. +-States should not allow patents to restrict development and use of +-software on general-purpose computers, but in those that do, we wish to +-avoid the special danger that patents applied to a free program could +-make it effectively proprietary. To prevent this, the GPL assures that +-patents cannot be used to render the program non-free. +- +- The precise terms and conditions for copying, distribution and +-modification follow. +- +- TERMS AND CONDITIONS +- +- 0. Definitions. +- +- "This License" refers to version 3 of the GNU General Public License. +- +- "Copyright" also means copyright-like laws that apply to other kinds of +-works, such as semiconductor masks. +- +- "The Program" refers to any copyrightable work licensed under this +-License. Each licensee is addressed as "you". "Licensees" and +-"recipients" may be individuals or organizations. +- +- To "modify" a work means to copy from or adapt all or part of the work +-in a fashion requiring copyright permission, other than the making of an +-exact copy. The resulting work is called a "modified version" of the +-earlier work or a work "based on" the earlier work. +- +- A "covered work" means either the unmodified Program or a work based +-on the Program. +- +- To "propagate" a work means to do anything with it that, without +-permission, would make you directly or secondarily liable for +-infringement under applicable copyright law, except executing it on a +-computer or modifying a private copy. Propagation includes copying, +-distribution (with or without modification), making available to the +-public, and in some countries other activities as well. +- +- To "convey" a work means any kind of propagation that enables other +-parties to make or receive copies. 
Mere interaction with a user through +-a computer network, with no transfer of a copy, is not conveying. +- +- An interactive user interface displays "Appropriate Legal Notices" +-to the extent that it includes a convenient and prominently visible +-feature that (1) displays an appropriate copyright notice, and (2) +-tells the user that there is no warranty for the work (except to the +-extent that warranties are provided), that licensees may convey the +-work under this License, and how to view a copy of this License. If +-the interface presents a list of user commands or options, such as a +-menu, a prominent item in the list meets this criterion. +- +- 1. Source Code. +- +- The "source code" for a work means the preferred form of the work +-for making modifications to it. "Object code" means any non-source +-form of a work. +- +- A "Standard Interface" means an interface that either is an official +-standard defined by a recognized standards body, or, in the case of +-interfaces specified for a particular programming language, one that +-is widely used among developers working in that language. +- +- The "System Libraries" of an executable work include anything, other +-than the work as a whole, that (a) is included in the normal form of +-packaging a Major Component, but which is not part of that Major +-Component, and (b) serves only to enable use of the work with that +-Major Component, or to implement a Standard Interface for which an +-implementation is available to the public in source code form. A +-"Major Component", in this context, means a major essential component +-(kernel, window system, and so on) of the specific operating system +-(if any) on which the executable work runs, or a compiler used to +-produce the work, or an object code interpreter used to run it. +- +- The "Corresponding Source" for a work in object code form means all +-the source code needed to generate, install, and (for an executable +-work) run the object code and to modify the work, including scripts to +-control those activities. However, it does not include the work's +-System Libraries, or general-purpose tools or generally available free +-programs which are used unmodified in performing those activities but +-which are not part of the work. For example, Corresponding Source +-includes interface definition files associated with source files for +-the work, and the source code for shared libraries and dynamically +-linked subprograms that the work is specifically designed to require, +-such as by intimate data communication or control flow between those +-subprograms and other parts of the work. +- +- The Corresponding Source need not include anything that users +-can regenerate automatically from other parts of the Corresponding +-Source. +- +- The Corresponding Source for a work in source code form is that +-same work. +- +- 2. Basic Permissions. +- +- All rights granted under this License are granted for the term of +-copyright on the Program, and are irrevocable provided the stated +-conditions are met. This License explicitly affirms your unlimited +-permission to run the unmodified Program. The output from running a +-covered work is covered by this License only if the output, given its +-content, constitutes a covered work. This License acknowledges your +-rights of fair use or other equivalent, as provided by copyright law. +- +- You may make, run and propagate covered works that you do not +-convey, without conditions so long as your license otherwise remains +-in force. 
You may convey covered works to others for the sole purpose +-of having them make modifications exclusively for you, or provide you +-with facilities for running those works, provided that you comply with +-the terms of this License in conveying all material for which you do +-not control copyright. Those thus making or running the covered works +-for you must do so exclusively on your behalf, under your direction +-and control, on terms that prohibit them from making any copies of +-your copyrighted material outside their relationship with you. +- +- Conveying under any other circumstances is permitted solely under +-the conditions stated below. Sublicensing is not allowed; section 10 +-makes it unnecessary. +- +- 3. Protecting Users' Legal Rights From Anti-Circumvention Law. +- +- No covered work shall be deemed part of an effective technological +-measure under any applicable law fulfilling obligations under article +-11 of the WIPO copyright treaty adopted on 20 December 1996, or +-similar laws prohibiting or restricting circumvention of such +-measures. +- +- When you convey a covered work, you waive any legal power to forbid +-circumvention of technological measures to the extent such circumvention +-is effected by exercising rights under this License with respect to +-the covered work, and you disclaim any intention to limit operation or +-modification of the work as a means of enforcing, against the work's +-users, your or third parties' legal rights to forbid circumvention of +-technological measures. +- +- 4. Conveying Verbatim Copies. +- +- You may convey verbatim copies of the Program's source code as you +-receive it, in any medium, provided that you conspicuously and +-appropriately publish on each copy an appropriate copyright notice; +-keep intact all notices stating that this License and any +-non-permissive terms added in accord with section 7 apply to the code; +-keep intact all notices of the absence of any warranty; and give all +-recipients a copy of this License along with the Program. +- +- You may charge any price or no price for each copy that you convey, +-and you may offer support or warranty protection for a fee. +- +- 5. Conveying Modified Source Versions. +- +- You may convey a work based on the Program, or the modifications to +-produce it from the Program, in the form of source code under the +-terms of section 4, provided that you also meet all of these conditions: +- +- a) The work must carry prominent notices stating that you modified +- it, and giving a relevant date. +- +- b) The work must carry prominent notices stating that it is +- released under this License and any conditions added under section +- 7. This requirement modifies the requirement in section 4 to +- "keep intact all notices". +- +- c) You must license the entire work, as a whole, under this +- License to anyone who comes into possession of a copy. This +- License will therefore apply, along with any applicable section 7 +- additional terms, to the whole of the work, and all its parts, +- regardless of how they are packaged. This License gives no +- permission to license the work in any other way, but it does not +- invalidate such permission if you have separately received it. +- +- d) If the work has interactive user interfaces, each must display +- Appropriate Legal Notices; however, if the Program has interactive +- interfaces that do not display Appropriate Legal Notices, your +- work need not make them do so. 
+- +- A compilation of a covered work with other separate and independent +-works, which are not by their nature extensions of the covered work, +-and which are not combined with it such as to form a larger program, +-in or on a volume of a storage or distribution medium, is called an +-"aggregate" if the compilation and its resulting copyright are not +-used to limit the access or legal rights of the compilation's users +-beyond what the individual works permit. Inclusion of a covered work +-in an aggregate does not cause this License to apply to the other +-parts of the aggregate. +- +- 6. Conveying Non-Source Forms. +- +- You may convey a covered work in object code form under the terms +-of sections 4 and 5, provided that you also convey the +-machine-readable Corresponding Source under the terms of this License, +-in one of these ways: +- +- a) Convey the object code in, or embodied in, a physical product +- (including a physical distribution medium), accompanied by the +- Corresponding Source fixed on a durable physical medium +- customarily used for software interchange. +- +- b) Convey the object code in, or embodied in, a physical product +- (including a physical distribution medium), accompanied by a +- written offer, valid for at least three years and valid for as +- long as you offer spare parts or customer support for that product +- model, to give anyone who possesses the object code either (1) a +- copy of the Corresponding Source for all the software in the +- product that is covered by this License, on a durable physical +- medium customarily used for software interchange, for a price no +- more than your reasonable cost of physically performing this +- conveying of source, or (2) access to copy the +- Corresponding Source from a network server at no charge. +- +- c) Convey individual copies of the object code with a copy of the +- written offer to provide the Corresponding Source. This +- alternative is allowed only occasionally and noncommercially, and +- only if you received the object code with such an offer, in accord +- with subsection 6b. +- +- d) Convey the object code by offering access from a designated +- place (gratis or for a charge), and offer equivalent access to the +- Corresponding Source in the same way through the same place at no +- further charge. You need not require recipients to copy the +- Corresponding Source along with the object code. If the place to +- copy the object code is a network server, the Corresponding Source +- may be on a different server (operated by you or a third party) +- that supports equivalent copying facilities, provided you maintain +- clear directions next to the object code saying where to find the +- Corresponding Source. Regardless of what server hosts the +- Corresponding Source, you remain obligated to ensure that it is +- available for as long as needed to satisfy these requirements. +- +- e) Convey the object code using peer-to-peer transmission, provided +- you inform other peers where the object code and Corresponding +- Source of the work are being offered to the general public at no +- charge under subsection 6d. +- +- A separable portion of the object code, whose source code is excluded +-from the Corresponding Source as a System Library, need not be +-included in conveying the object code work. 
+- +- A "User Product" is either (1) a "consumer product", which means any +-tangible personal property which is normally used for personal, family, +-or household purposes, or (2) anything designed or sold for incorporation +-into a dwelling. In determining whether a product is a consumer product, +-doubtful cases shall be resolved in favor of coverage. For a particular +-product received by a particular user, "normally used" refers to a +-typical or common use of that class of product, regardless of the status +-of the particular user or of the way in which the particular user +-actually uses, or expects or is expected to use, the product. A product +-is a consumer product regardless of whether the product has substantial +-commercial, industrial or non-consumer uses, unless such uses represent +-the only significant mode of use of the product. +- +- "Installation Information" for a User Product means any methods, +-procedures, authorization keys, or other information required to install +-and execute modified versions of a covered work in that User Product from +-a modified version of its Corresponding Source. The information must +-suffice to ensure that the continued functioning of the modified object +-code is in no case prevented or interfered with solely because +-modification has been made. +- +- If you convey an object code work under this section in, or with, or +-specifically for use in, a User Product, and the conveying occurs as +-part of a transaction in which the right of possession and use of the +-User Product is transferred to the recipient in perpetuity or for a +-fixed term (regardless of how the transaction is characterized), the +-Corresponding Source conveyed under this section must be accompanied +-by the Installation Information. But this requirement does not apply +-if neither you nor any third party retains the ability to install +-modified object code on the User Product (for example, the work has +-been installed in ROM). +- +- The requirement to provide Installation Information does not include a +-requirement to continue to provide support service, warranty, or updates +-for a work that has been modified or installed by the recipient, or for +-the User Product in which it has been modified or installed. Access to a +-network may be denied when the modification itself materially and +-adversely affects the operation of the network or violates the rules and +-protocols for communication across the network. +- +- Corresponding Source conveyed, and Installation Information provided, +-in accord with this section must be in a format that is publicly +-documented (and with an implementation available to the public in +-source code form), and must require no special password or key for +-unpacking, reading or copying. +- +- 7. Additional Terms. +- +- "Additional permissions" are terms that supplement the terms of this +-License by making exceptions from one or more of its conditions. +-Additional permissions that are applicable to the entire Program shall +-be treated as though they were included in this License, to the extent +-that they are valid under applicable law. If additional permissions +-apply only to part of the Program, that part may be used separately +-under those permissions, but the entire Program remains governed by +-this License without regard to the additional permissions. +- +- When you convey a copy of a covered work, you may at your option +-remove any additional permissions from that copy, or from any part of +-it. 
(Additional permissions may be written to require their own +-removal in certain cases when you modify the work.) You may place +-additional permissions on material, added by you to a covered work, +-for which you have or can give appropriate copyright permission. +- +- Notwithstanding any other provision of this License, for material you +-add to a covered work, you may (if authorized by the copyright holders of +-that material) supplement the terms of this License with terms: +- +- a) Disclaiming warranty or limiting liability differently from the +- terms of sections 15 and 16 of this License; or +- +- b) Requiring preservation of specified reasonable legal notices or +- author attributions in that material or in the Appropriate Legal +- Notices displayed by works containing it; or +- +- c) Prohibiting misrepresentation of the origin of that material, or +- requiring that modified versions of such material be marked in +- reasonable ways as different from the original version; or +- +- d) Limiting the use for publicity purposes of names of licensors or +- authors of the material; or +- +- e) Declining to grant rights under trademark law for use of some +- trade names, trademarks, or service marks; or +- +- f) Requiring indemnification of licensors and authors of that +- material by anyone who conveys the material (or modified versions of +- it) with contractual assumptions of liability to the recipient, for +- any liability that these contractual assumptions directly impose on +- those licensors and authors. +- +- All other non-permissive additional terms are considered "further +-restrictions" within the meaning of section 10. If the Program as you +-received it, or any part of it, contains a notice stating that it is +-governed by this License along with a term that is a further +-restriction, you may remove that term. If a license document contains +-a further restriction but permits relicensing or conveying under this +-License, you may add to a covered work material governed by the terms +-of that license document, provided that the further restriction does +-not survive such relicensing or conveying. +- +- If you add terms to a covered work in accord with this section, you +-must place, in the relevant source files, a statement of the +-additional terms that apply to those files, or a notice indicating +-where to find the applicable terms. +- +- Additional terms, permissive or non-permissive, may be stated in the +-form of a separately written license, or stated as exceptions; +-the above requirements apply either way. +- +- 8. Termination. +- +- You may not propagate or modify a covered work except as expressly +-provided under this License. Any attempt otherwise to propagate or +-modify it is void, and will automatically terminate your rights under +-this License (including any patent licenses granted under the third +-paragraph of section 11). +- +- However, if you cease all violation of this License, then your +-license from a particular copyright holder is reinstated (a) +-provisionally, unless and until the copyright holder explicitly and +-finally terminates your license, and (b) permanently, if the copyright +-holder fails to notify you of the violation by some reasonable means +-prior to 60 days after the cessation. 
+- +- Moreover, your license from a particular copyright holder is +-reinstated permanently if the copyright holder notifies you of the +-violation by some reasonable means, this is the first time you have +-received notice of violation of this License (for any work) from that +-copyright holder, and you cure the violation prior to 30 days after +-your receipt of the notice. +- +- Termination of your rights under this section does not terminate the +-licenses of parties who have received copies or rights from you under +-this License. If your rights have been terminated and not permanently +-reinstated, you do not qualify to receive new licenses for the same +-material under section 10. +- +- 9. Acceptance Not Required for Having Copies. +- +- You are not required to accept this License in order to receive or +-run a copy of the Program. Ancillary propagation of a covered work +-occurring solely as a consequence of using peer-to-peer transmission +-to receive a copy likewise does not require acceptance. However, +-nothing other than this License grants you permission to propagate or +-modify any covered work. These actions infringe copyright if you do +-not accept this License. Therefore, by modifying or propagating a +-covered work, you indicate your acceptance of this License to do so. +- +- 10. Automatic Licensing of Downstream Recipients. +- +- Each time you convey a covered work, the recipient automatically +-receives a license from the original licensors, to run, modify and +-propagate that work, subject to this License. You are not responsible +-for enforcing compliance by third parties with this License. +- +- An "entity transaction" is a transaction transferring control of an +-organization, or substantially all assets of one, or subdividing an +-organization, or merging organizations. If propagation of a covered +-work results from an entity transaction, each party to that +-transaction who receives a copy of the work also receives whatever +-licenses to the work the party's predecessor in interest had or could +-give under the previous paragraph, plus a right to possession of the +-Corresponding Source of the work from the predecessor in interest, if +-the predecessor has it or can get it with reasonable efforts. +- +- You may not impose any further restrictions on the exercise of the +-rights granted or affirmed under this License. For example, you may +-not impose a license fee, royalty, or other charge for exercise of +-rights granted under this License, and you may not initiate litigation +-(including a cross-claim or counterclaim in a lawsuit) alleging that +-any patent claim is infringed by making, using, selling, offering for +-sale, or importing the Program or any portion of it. +- +- 11. Patents. +- +- A "contributor" is a copyright holder who authorizes use under this +-License of the Program or a work on which the Program is based. The +-work thus licensed is called the contributor's "contributor version". +- +- A contributor's "essential patent claims" are all patent claims +-owned or controlled by the contributor, whether already acquired or +-hereafter acquired, that would be infringed by some manner, permitted +-by this License, of making, using, or selling its contributor version, +-but do not include claims that would be infringed only as a +-consequence of further modification of the contributor version. For +-purposes of this definition, "control" includes the right to grant +-patent sublicenses in a manner consistent with the requirements of +-this License. 
+- +- Each contributor grants you a non-exclusive, worldwide, royalty-free +-patent license under the contributor's essential patent claims, to +-make, use, sell, offer for sale, import and otherwise run, modify and +-propagate the contents of its contributor version. +- +- In the following three paragraphs, a "patent license" is any express +-agreement or commitment, however denominated, not to enforce a patent +-(such as an express permission to practice a patent or covenant not to +-sue for patent infringement). To "grant" such a patent license to a +-party means to make such an agreement or commitment not to enforce a +-patent against the party. +- +- If you convey a covered work, knowingly relying on a patent license, +-and the Corresponding Source of the work is not available for anyone +-to copy, free of charge and under the terms of this License, through a +-publicly available network server or other readily accessible means, +-then you must either (1) cause the Corresponding Source to be so +-available, or (2) arrange to deprive yourself of the benefit of the +-patent license for this particular work, or (3) arrange, in a manner +-consistent with the requirements of this License, to extend the patent +-license to downstream recipients. "Knowingly relying" means you have +-actual knowledge that, but for the patent license, your conveying the +-covered work in a country, or your recipient's use of the covered work +-in a country, would infringe one or more identifiable patents in that +-country that you have reason to believe are valid. +- +- If, pursuant to or in connection with a single transaction or +-arrangement, you convey, or propagate by procuring conveyance of, a +-covered work, and grant a patent license to some of the parties +-receiving the covered work authorizing them to use, propagate, modify +-or convey a specific copy of the covered work, then the patent license +-you grant is automatically extended to all recipients of the covered +-work and works based on it. +- +- A patent license is "discriminatory" if it does not include within +-the scope of its coverage, prohibits the exercise of, or is +-conditioned on the non-exercise of one or more of the rights that are +-specifically granted under this License. You may not convey a covered +-work if you are a party to an arrangement with a third party that is +-in the business of distributing software, under which you make payment +-to the third party based on the extent of your activity of conveying +-the work, and under which the third party grants, to any of the +-parties who would receive the covered work from you, a discriminatory +-patent license (a) in connection with copies of the covered work +-conveyed by you (or copies made from those copies), or (b) primarily +-for and in connection with specific products or compilations that +-contain the covered work, unless you entered into that arrangement, +-or that patent license was granted, prior to 28 March 2007. +- +- Nothing in this License shall be construed as excluding or limiting +-any implied license or other defenses to infringement that may +-otherwise be available to you under applicable patent law. +- +- 12. No Surrender of Others' Freedom. +- +- If conditions are imposed on you (whether by court order, agreement or +-otherwise) that contradict the conditions of this License, they do not +-excuse you from the conditions of this License. 
If you cannot convey a +-covered work so as to satisfy simultaneously your obligations under this +-License and any other pertinent obligations, then as a consequence you may +-not convey it at all. For example, if you agree to terms that obligate you +-to collect a royalty for further conveying from those to whom you convey +-the Program, the only way you could satisfy both those terms and this +-License would be to refrain entirely from conveying the Program. +- +- 13. Use with the GNU Affero General Public License. +- +- Notwithstanding any other provision of this License, you have +-permission to link or combine any covered work with a work licensed +-under version 3 of the GNU Affero General Public License into a single +-combined work, and to convey the resulting work. The terms of this +-License will continue to apply to the part which is the covered work, +-but the special requirements of the GNU Affero General Public License, +-section 13, concerning interaction through a network will apply to the +-combination as such. +- +- 14. Revised Versions of this License. +- +- The Free Software Foundation may publish revised and/or new versions of +-the GNU General Public License from time to time. Such new versions will +-be similar in spirit to the present version, but may differ in detail to +-address new problems or concerns. +- +- Each version is given a distinguishing version number. If the +-Program specifies that a certain numbered version of the GNU General +-Public License "or any later version" applies to it, you have the +-option of following the terms and conditions either of that numbered +-version or of any later version published by the Free Software +-Foundation. If the Program does not specify a version number of the +-GNU General Public License, you may choose any version ever published +-by the Free Software Foundation. +- +- If the Program specifies that a proxy can decide which future +-versions of the GNU General Public License can be used, that proxy's +-public statement of acceptance of a version permanently authorizes you +-to choose that version for the Program. +- +- Later license versions may give you additional or different +-permissions. However, no additional obligations are imposed on any +-author or copyright holder as a result of your choosing to follow a +-later version. +- +- 15. Disclaimer of Warranty. +- +- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +-APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +-IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +-ALL NECESSARY SERVICING, REPAIR OR CORRECTION. +- +- 16. Limitation of Liability. 
+- +- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +-SUCH DAMAGES. +- +- 17. Interpretation of Sections 15 and 16. +- +- If the disclaimer of warranty and limitation of liability provided +-above cannot be given local legal effect according to their terms, +-reviewing courts shall apply local law that most closely approximates +-an absolute waiver of all civil liability in connection with the +-Program, unless a warranty or assumption of liability accompanies a +-copy of the Program in return for a fee. +- +- END OF TERMS AND CONDITIONS +- +- How to Apply These Terms to Your New Programs +- +- If you develop a new program, and you want it to be of the greatest +-possible use to the public, the best way to achieve this is to make it +-free software which everyone can redistribute and change under these terms. +- +- To do so, attach the following notices to the program. It is safest +-to attach them to the start of each source file to most effectively +-state the exclusion of warranty; and each file should have at least +-the "copyright" line and a pointer to where the full notice is found. +- +- +- Copyright (C) +- +- This program is free software: you can redistribute it and/or modify +- it under the terms of the GNU General Public License as published by +- the Free Software Foundation, either version 3 of the License, or +- (at your option) any later version. +- +- This program is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- GNU General Public License for more details. +- +- You should have received a copy of the GNU General Public License +- along with this program. If not, see . +- +-Also add information on how to contact you by electronic and paper mail. +- +- If the program does terminal interaction, make it output a short +-notice like this when it starts in an interactive mode: +- +- Copyright (C) +- This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. +- This is free software, and you are welcome to redistribute it +- under certain conditions; type `show c' for details. +- +-The hypothetical commands `show w' and `show c' should show the appropriate +-parts of the General Public License. Of course, your program's commands +-might be different; for a GUI interface, you would use an "about box". +- +- You should also get your employer (if you work as a programmer) or school, +-if any, to sign a "copyright disclaimer" for the program, if necessary. +-For more information on this, and how to apply and follow the GNU GPL, see +-. +- +- The GNU General Public License does not permit incorporating your program +-into proprietary programs. If your program is a subroutine library, you +-may consider it more useful to permit linking proprietary applications with +-the library. If this is what you want to do, use the GNU Lesser General +-Public License instead of this License. 
But first, please read +-. +diff --git a/lib/PBA/ProgramCU.cu b/lib/PBA/ProgramCU.cu +deleted file mode 100644 +index 890c20f..0000000 +--- a/lib/PBA/ProgramCU.cu ++++ /dev/null +@@ -1,3637 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ProgramCU.cu +-// Author: Changchang Wu +-// Description : implementation of ProgramCU and all CUDA kernels +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include "CuTexImage.h" +-#include "ProgramCU.h" +- +-#define IMUL(X, Y) __mul24(X, Y) +-#define FDIV(X, Y) __fdividef(X, Y) +-#define FDIV2(X, Y) ((X) / (Y)) +-#define MAX_BLOCKLEN 65535 +-#define MAX_BLOCKLEN_ALIGN 65504 +-#define MAX_TEXSIZE (1 << 29) +-#define TEX_TOOBIG4(sz) (sz >> 31) +-#define REDUCTION_NBLOCK 32 +- +-namespace pba { +- +-inline void CuTexImage::BindTexture(textureReference& texRef) { +- size_t sz = GetDataSize(); +- if (sz > MAX_TEXSIZE) +- fprintf(stderr, "cudaBindTexture: %lX > %d\n", sz, MAX_TEXSIZE); +- cudaError_t e = +- cudaBindTexture(NULL, &texRef, data(), &texRef.channelDesc, sz); +-} +- +-inline void CuTexImage::BindTexture(textureReference& texRef, int offset, +- size_t size) { +- cudaError_t e = cudaBindTexture(NULL, &texRef, (char*)_cuData + offset, +- &texRef.channelDesc, size); +- if (e) fprintf(stderr, "cudaBindTexture: none-zero offset\n"); +-} +- +-inline void CuTexImage::BindTexture2(textureReference& texRef1, +- textureReference& texRef2) { +- size_t sz = GetDataSize(); +- if (sz <= MAX_TEXSIZE) { +- BindTexture(texRef1); +- } else { +- BindTexture(texRef1, 0, MAX_TEXSIZE); +- BindTexture(texRef2, MAX_TEXSIZE, sz - MAX_TEXSIZE); +- } +-} +- +-inline void CuTexImage::BindTexture4(textureReference& texRef1, +- textureReference& texRef2, +- textureReference& texRef3, +- textureReference& texRef4) { +- size_t sz = GetDataSize(); +- if (sz <= MAX_TEXSIZE) { +- BindTexture(texRef1); +- } else { +- BindTexture(texRef1, 0, MAX_TEXSIZE); +- if (sz <= 2 * MAX_TEXSIZE) { +- BindTexture(texRef2, MAX_TEXSIZE, sz - MAX_TEXSIZE); +- } else { +- BindTexture(texRef2, MAX_TEXSIZE, MAX_TEXSIZE); +- if (sz <= 3 * MAX_TEXSIZE) { +- BindTexture(texRef3, MAX_TEXSIZE * 2, sz - MAX_TEXSIZE * 2); +- } else { +- BindTexture(texRef3, MAX_TEXSIZE * 2, MAX_TEXSIZE); +- BindTexture(texRef4, MAX_TEXSIZE * 3, sz - MAX_TEXSIZE * 3); +- } +- } +- } +-} +- +-inline int CuTexImage::BindTextureX(textureReference& texRef1, +- textureReference& texRef2, +- textureReference& texRef3, +- textureReference& texRef4, bool force4) { +- size_t szjc = GetDataSize(); +- if (TEX_TOOBIG4(szjc)) { +- return 0; +- } else if (force4) { +- BindTexture4(texRef1, texRef2, texRef3, texRef4); +- return 4; +- } else if (szjc > 2 * MAX_TEXSIZE) { +- return 0; +- } else if (szjc > MAX_TEXSIZE) { +- BindTexture2(texRef1, texRef2); +- return 2; +- } else { +- 
BindTexture(texRef1); +- return 1; +- } +-} +- +-void ProgramCU::FinishWorkCUDA() { cudaThreadSynchronize(); } +- +-int ProgramCU::CheckErrorCUDA(const char* location) { +- cudaError_t e = cudaGetLastError(); +- if (e) { +- if (location) fprintf(stderr, "%s:\t", location); +- fprintf(stderr, "%s(%d)\n", cudaGetErrorString(e), e); +- throw location; +- } else { +- // fprintf(stderr, "%s:\n", location); +- return 0; +- } +-} +- +-inline void ProgramCU::GetBlockConfiguration(unsigned int nblock, +- unsigned int& bw, +- unsigned int& bh) { +- if (nblock <= MAX_BLOCKLEN) { +- bw = nblock; +- bh = 1; +- } else { +- bh = (nblock + MAX_BLOCKLEN_ALIGN - 1) / MAX_BLOCKLEN_ALIGN; +- bw = (nblock + bh - 1) / bh; +- bw = ((bw + 31) / 32) * 32; +- bh = (nblock + bw - 1) / bw; +- } +-} +- +-void ProgramCU::ClearPreviousError() { cudaGetLastError(); } +- +-void ProgramCU::ResetCurrentDevice() { +- int device = 0; +- cudaGetDevice(&device); +- cudaDeviceReset(); +- if (device > 0) cudaSetDevice(device); +-} +- +-size_t ProgramCU::GetCudaMemoryCap() { +- int device; +- if (cudaGetDevice(&device) != cudaSuccess) return 0; +- cudaDeviceProp prop; +- if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) { +- if (prop.major == 9999 && prop.minor == 9999) return 0; +- return prop.totalGlobalMem; +- } else +- return 0; +-} +-int ProgramCU::SetCudaDevice(int device) { +- int count = 0, device_used; +- if (cudaGetDeviceCount(&count) || count <= 0) { +- ProgramCU::CheckErrorCUDA("CheckCudaDevice"); +- return 0; +- } else if (count == 1) { +- cudaDeviceProp deviceProp; +- if (cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess) { +- fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n"); +- return 0; +- } +- if (deviceProp.major == 9999 && deviceProp.minor == 9999) { +- fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n"); +- return 0; +- } +- } +- +- if (device > 0 && device < count) { +- cudaSetDevice(device); +- CheckErrorCUDA("cudaSetDevice\n"); +- } +- cudaGetDevice(&device_used); +- if (device != device_used) +- fprintf(stderr, +- "ERROR: Cannot set device to %d\n" +- "WARNING: Use device-%d instead (out of %d)\n", +- device, device_used, count); +- return 1; +-} +- +-#define WARP_REDUCTION_32(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 16) value[threadIdx.x] += value[threadIdx.x + 16]; \ +- if (threadIdx.x < 8) value[threadIdx.x] += value[threadIdx.x + 8]; \ +- if (threadIdx.x < 4) value[threadIdx.x] += value[threadIdx.x + 4]; \ +- if (threadIdx.x < 2) value[threadIdx.x] += value[threadIdx.x + 2]; +- +-#define WARP_REDUCTION_64(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 32) value[threadIdx.x] += value[threadIdx.x + 32]; \ +- WARP_REDUCTION_32(value) +- +-#define WARP_REDUCTION_128(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 64) value[threadIdx.x] += value[threadIdx.x + 64]; \ +- WARP_REDUCTION_64(value) +- +-#define WARP_REDUCTION_256(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 128) value[threadIdx.x] += value[threadIdx.x + 128]; \ +- WARP_REDUCTION_128(value) +- +-__global__ void vector_max_kernel(const float* x, int len, int blen, +- float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v = max(v, fabs(x[i])); +- value[threadIdx.x] = v; +- // reduce to the first two values +- __syncthreads(); +- if (threadIdx.x < 128) +- value[threadIdx.x] = max(value[threadIdx.x], 
value[threadIdx.x + 128]); +- __syncthreads(); +- if (threadIdx.x < 64) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 64]); +- __syncthreads(); +- if (threadIdx.x < 32) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 32]); +- if (threadIdx.x < 16) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 16]); +- if (threadIdx.x < 8) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 8]); +- if (threadIdx.x < 4) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 4]); +- if (threadIdx.x < 2) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 2]); +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = max(value[0], value[1]); +-} +- +-float ProgramCU::ComputeVectorMax(CuTexImage& vector, CuTexImage& buf) { +- const unsigned int nblock = 32; +- const unsigned int bsize = 256; +- int len = vector.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- vector_max_kernel<<>>(vector.data(), len, blen, buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorMax"); +- +- float data[nblock], result = 0; +- buf.CopyToHost(data); +- for (unsigned int i = 0; i < nblock; ++i) result = max(result, data[i]); +- return result; +-} +- +-__global__ void vector_norm_kernel(const float* x, int len, int blen, +- float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) { +- float temp = x[i]; +- v += (temp * temp); +- } +- value[threadIdx.x] = v; +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-double ProgramCU::ComputeVectorNorm(CuTexImage& vector, CuTexImage& buf) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- vector_norm_kernel<<>>(vector.data(), len, blen, buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorNorm"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return result; +-} +- +-__global__ void vector_sum_kernel(const float* x, int len, int blen, +- float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v += x[i]; +- +- value[threadIdx.x] = v; +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-float ProgramCU::ComputeVectorSum(CuTexImage& vector, CuTexImage& buf, +- int skip) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector.GetLength() - skip; +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- 
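For reference while reading the reduction kernels removed here (vector_max_kernel, vector_norm_kernel, vector_sum_kernel, vector_dotproduct_kernel): each block walks its own slice of the input, folds the partial result with the WARP_REDUCTION_256 shared-memory tree, writes one float per block, and the host adds up the handful of per-block values. Below is a minimal, self-contained sketch of the same two-stage idea using an ordinary grid-stride loop; the names partial_sum_kernel and sum_on_gpu are hypothetical and not part of PBA.

#include <cuda_runtime.h>

// Stage 1: each block accumulates a grid-stride partial sum and reduces it in shared memory.
__global__ void partial_sum_kernel(const float* x, int len, float* partial) {
  __shared__ float value[256];
  float v = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += gridDim.x * blockDim.x)
    v += x[i];
  value[threadIdx.x] = v;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // plays the role of WARP_REDUCTION_256
    if (threadIdx.x < s) value[threadIdx.x] += value[threadIdx.x + s];
    __syncthreads();
  }
  if (threadIdx.x == 0) partial[blockIdx.x] = value[0];
}

// Stage 2: copy the few per-block partials back and finish the sum on the host.
double sum_on_gpu(const float* d_x, int len) {
  const int nblock = 32, bsize = 256;
  float* d_partial = nullptr;
  cudaMalloc(&d_partial, nblock * sizeof(float));
  partial_sum_kernel<<<nblock, bsize>>>(d_x, len, d_partial);
  float h_partial[nblock];
  cudaMemcpy(h_partial, d_partial, sizeof(h_partial), cudaMemcpyDeviceToHost);
  cudaFree(d_partial);
  double result = 0.0;
  for (int i = 0; i < nblock; ++i) result += h_partial[i];
  return result;
}

The deleted code fixes the block count at REDUCTION_NBLOCK (32) and finishes the last 32 additions on the CPU after buf.CopyToHost, which keeps the device side to a single kernel launch; its WARP_REDUCTION macros also rely on implicit warp-synchronous behaviour below 32 threads, whereas the explicit __syncthreads() in the sketch is the portable form.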
buf.InitTexture(nblock, 1); +- vector_sum_kernel<<>>((vector.data()) + skip, len, blen, +- buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorSum"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return (float)result; +-} +- +-__global__ void vector_dotproduct_kernel(const float* a, const float* b, +- int len, int blen, float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v += (a[i] * b[i]); +- value[threadIdx.x] = v; +- +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-double ProgramCU::ComputeVectorDot(CuTexImage& vector1, CuTexImage& vector2, +- CuTexImage& buf) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector1.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- vector_dotproduct_kernel<<>>(vector1.data(), vector2.data(), len, +- blen, buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorDot"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return result; +-} +- +-__global__ void vector_weighted_norm_kernel(const float* vec, const float* w, +- int len, int blen, float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v += (vec[i] * w[i] * vec[i]); +- value[threadIdx.x] = v; +- +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-double ProgramCU::ComputeVectorNormW(CuTexImage& vector, CuTexImage& weight, +- CuTexImage& buf) { +- if (weight.IsValid()) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- +- vector_weighted_norm_kernel<<>>(vector.data(), weight.data(), +- len, blen, buf.data()); +- +- ProgramCU::CheckErrorCUDA("ComputeVectorNormW"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return result; +- } else { +- return ComputeVectorNorm(vector, buf); +- } +-} +-// given vector x, y, and a weight a +-// return a * x + y +-__global__ void saxpy_kernel(const float a, const float* x, const float* y, +- float* result, unsigned int len) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; +- if (idx < len) result[idx] = a * x[idx] + y[idx]; +-} +- +-__global__ void saxpy_kernel_large(const float a, const float* x, +- const float* y, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) result[idx] = a * x[idx] + 
y[idx]; +-} +- +-void ProgramCU::ComputeSAXPY(float a, CuTexImage& texX, CuTexImage& texY, +- CuTexImage& result) { +- unsigned int len = result.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- if (nblock > MAX_BLOCKLEN) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- saxpy_kernel_large<<>>(a, texX.data(), texY.data(), +- result.data(), len, bw * bsize); +- } else { +- dim3 grid(nblock), block(bsize); +- saxpy_kernel<<>>(a, texX.data(), texY.data(), result.data(), +- len); +- } +- ProgramCU::CheckErrorCUDA("ComputeSAXPY"); +-} +- +-__global__ void sxypz_kernel_large(float a, const float* x, const float* y, +- const float* z, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) result[idx] = a * x[idx] * y[idx] + z[idx]; +-} +- +-void ProgramCU::ComputeSXYPZ(float a, CuTexImage& texX, CuTexImage& texY, +- CuTexImage& texZ, CuTexImage& result) { +- if (texX.IsValid()) { +- unsigned int len = texX.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- sxypz_kernel_large<<>>(a, texX.data(), texY.data(), +- texZ.data(), result.data(), len, +- bw * bsize); +- } else { +- ComputeSAXPY(a, texY, texZ, result); +- } +-} +- +-__global__ void vxy_kernel(const float* x, float* y, float* result, +- unsigned int len) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; +- if (idx < len) result[idx] = x[idx] * y[idx]; +-} +- +-__global__ void vxy_kernel_large(const float* x, float* y, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + rowsz * blockIdx.y; +- if (idx < len) result[idx] = x[idx] * y[idx]; +-} +- +-void ProgramCU::ComputeVXY(CuTexImage& texX, CuTexImage& texY, +- CuTexImage& result, unsigned int part, +- unsigned int skip) { +- unsigned int len = part ? part : texX.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- if (nblock > MAX_BLOCKLEN) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- vxy_kernel_large<<>>(texX.data() + skip, texY.data() + skip, +- result.data() + skip, len, bsize * bw); +- } else { +- dim3 grid(nblock), block(bsize); +- vxy_kernel<<>>(texX.data() + skip, texY.data() + skip, +- result.data() + skip, len); +- } +- ProgramCU::CheckErrorCUDA("ComputeVXY"); +-} +- +-__global__ void sqrt_kernel_large(float* x, unsigned int len, +- unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) x[idx] = sqrt(x[idx]); +-} +- +-void ProgramCU::ComputeSQRT(CuTexImage& tex) { +- unsigned int len = tex.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- sqrt_kernel_large<<>>(tex.data(), len, bw * bsize); +- ProgramCU::CheckErrorCUDA("ComputeSQRT"); +-} +- +-__global__ void rsqrt_kernel_large(float* x, unsigned int len, +- unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) x[idx] = x[idx] > 0 ? 
rsqrt(x[idx]) : 0; +-} +- +-void ProgramCU::ComputeRSQRT(CuTexImage& tex) { +- unsigned int len = tex.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- rsqrt_kernel_large<<>>(tex.data(), len, bw * bsize); +- +- ProgramCU::CheckErrorCUDA("ComputeRSQRT"); +-} +- +-__global__ void sax_kernel(const float a, const float* x, float* result, +- unsigned int len) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; +- if (idx < len) result[idx] = a * x[idx]; +-} +- +-__global__ void sax_kernel_large(const float a, const float* x, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (idx < len) result[idx] = a * x[idx]; +-} +- +-void ProgramCU::ComputeSAX(float a, CuTexImage& texX, CuTexImage& result) { +- unsigned int len = texX.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- +- if (nblock > MAX_BLOCKLEN) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- sax_kernel_large<<>>(a, texX.data(), result.data(), len, +- bw * bsize); +- } else { +- dim3 grid(nblock), block(bsize); +- sax_kernel<<>>(a, texX.data(), result.data(), len); +- } +- ProgramCU::CheckErrorCUDA("ComputeSAX"); +-} +- +-#define JACOBIAN_FRT_KWIDTH 64 +- +-texture tex_jacobian_cam; +-texture tex_jacobian_pts; +-texture tex_jacobian_idx; +-texture tex_jacobian_meas; +-texture tex_jacobian_sj; +-texture tex_jacobian_shuffle; +- +-#ifndef PBA_DISABLE_CONST_CAMERA +-#define JACOBIAN_SET_JC_BEGIN if (r3.w == 0.0f) { +-#define JFRT_SET_JC_END \ +- } \ +- else { \ +- jc[jc_pos] = make_float4(0, 0, 0, 0); \ +- jc[jc_pos + 1] = make_float4(0, 0, 0, 0); \ +- jc[jc_pos + 2] = make_float4(0, 0, 0, 0); \ +- jc[jc_pos + 3] = make_float4(0, 0, 0, 0); \ +- } +-#define JACOBIAN_SET_JC_END \ +- } \ +- else { \ +- jxc[0] = 0; \ +- jxc[1] = 0; \ +- jxc[2] = 0; \ +- jxc[3] = 0; \ +- jxc[4] = 0; \ +- jxc[5] = 0; \ +- jxc[6] = 0; \ +- jxc[7] = 0; \ +- jyc[0] = 0; \ +- jyc[1] = 0; \ +- jyc[2] = 0; \ +- jyc[3] = 0; \ +- jyc[4] = 0; \ +- jyc[5] = 0; \ +- jyc[6] = 0; \ +- jyc[7] = 0; \ +- } +-#else +-#define JACOBIAN_SET_JC_BEGIN +-#define JFRT_SET_JC_END +-#define JACOBIAN_SET_JC_END +-#endif +- +-// projection model ei = K(RX + T) - (1 + r * m^2) * m +-template +-__global__ void jacobian_frt_kernel(float4* jc, float4* jp, int nproj, int ptx, +- int rowsz, float jic) { +- //////////////////////////////// +- int tidx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- +- if (tidx >= nproj) return; +- int2 proj = tex1Dfetch(tex_jacobian_idx, tidx); +- int camera_pos = proj.x << 1; +- +- __shared__ float rr_data[JACOBIAN_FRT_KWIDTH * 9]; +- float* r = rr_data + IMUL(9, threadIdx.x); +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, proj.y); +- float m[3]; +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- 
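As a reading aid for the Jacobian and projection kernels in this deleted file: in the undistorted branch, the quantity being differentiated is the measurement minus f * (R m + t) after perspective division, which is what the x0/y0/z0, f_p2, p0_p2 and p1_p2 terms computed around this point express in packed float4 form. A standalone device-side sketch of that residual with a plain struct in place of PBA's camera layout follows; SimpleCamera and project_residual are illustrative names only.

#include <cuda_runtime.h>

// Illustrative camera: focal length f, row-major rotation r[9], translation t[3].
struct SimpleCamera { float f; float r[9]; float t[3]; };

// Reprojection residual of 3D point m against measurement (mx, my), no radial distortion.
__device__ float2 project_residual(const SimpleCamera& cam, const float3& m,
                                   float mx, float my) {
  float p0 = cam.r[0] * m.x + cam.r[1] * m.y + cam.r[2] * m.z + cam.t[0];
  float p1 = cam.r[3] * m.x + cam.r[4] * m.y + cam.r[5] * m.z + cam.t[1];
  float p2 = cam.r[6] * m.x + cam.r[7] * m.y + cam.r[8] * m.z + cam.t[2];
  float f_p2 = __fdividef(cam.f, p2);  // the fast divide the original wraps as FDIV
  return make_float2(mx - p0 * f_p2, my - p1 * f_p2);
}

The two distortion branches add either a projection-space factor, 1 + r * (p0*p0 + p1*p1) / (p2*p2), or a measurement-space factor, 1 + r * (mx*mx + my*my), on top of this.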
float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +- +- // dp/dx = [f/p2 0 -f*p0/p2/p2] +- // [0 f/p2 -f*p1/p2/p2] +- // dx/dw = [ 0 z -y] +- // [-z 0 x] +- // [ y -x 0] +- // R(dw) (x y z)' = (0 -z y)' dw0 + (z 0 -x)'dw1 + (-y x 0)'dw2 +- int jc_pos; +- if (shuffle) { +- jc_pos = tex1Dfetch(tex_jacobian_shuffle, tidx) << 2; +- } else { +- jc_pos = tidx << 2; +- } +- +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- if (scaling == false) { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- // float jic = (r3.w != 1.0f && r3.w != 2.0f) ? 1.0f : 0.0f; +- // float jec = (r3.w != 1.0f && r3.w != 3.0f) ? 1.0f : 0.0f; +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- jc[jc_pos] = make_float4(p0_p2 * jfc, f_p2_x, 0, -f_p2_x * p0_p2); +- jc[jc_pos + 1] = +- make_float4(-f_p2_x * p0_p2 * y0, f_p2_x * (z0 + x0 * p0_p2), +- -f_p2_x * y0, ft_x_pn * p0_p2); +- jc[jc_pos + 2] = make_float4(p1_p2 * jfc, 0, f_p2_y, -f_p2 * p1_p2); +- jc[jc_pos + 3] = +- make_float4(-f_p2_y * (z0 + y0 * p1_p2), f_p2_y * x0 * p1_p2, +- f_p2_y * x0, ft_x_pn * p1_p2); +- JFRT_SET_JC_END +- } +- //////////////////// +- jp[(tidx << 1)] = make_float4(f_p2_x * (r[0] - r[6] * p0_p2), +- f_p2_x * (r[1] - r[7] * p0_p2), +- f_p2_x * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = make_float4(f_p2_y * (r[3] - r[6] * p1_p2), +- f_p2_y * (r[4] - r[7] * p1_p2), +- f_p2_y * (r[5] - r[8] * p1_p2), 0); +- } else { +- //////////////////// +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x); +- jc[jc_pos] = make_float4(p0_p2 * jfc * sc1.x, f_p2_x * sc1.y, 0, +- -f_p2_x * p0_p2 * sc1.w); +- jc[jc_pos + 2] = make_float4(p1_p2 * jfc * sc1.x, 0, f_p2_y * sc1.z, +- -f_p2_y * p1_p2 * sc1.w); +- +- float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1); +- jc[jc_pos + 1] = make_float4( +- -sc2.x * f_p2_x * p0_p2 * y0, sc2.y * f_p2_x * (z0 + x0 * p0_p2), +- -sc2.z * f_p2_x * y0, ft_x_pn * p0_p2 * sc2.w); +- jc[jc_pos + 3] = make_float4( +- -sc2.x * f_p2_y * (z0 + y0 * p1_p2), sc2.y * f_p2_y * x0 * p1_p2, +- sc2.z * f_p2_y * x0, ft_x_pn * p1_p2 * sc2.w); +- JFRT_SET_JC_END +- } +- +- float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx); +- jp[(tidx << 1)] = make_float4(sc3.x * f_p2_x * (r[0] - r[6] * p0_p2), +- sc3.y * f_p2_x * (r[1] - r[7] * p0_p2), +- sc3.z * f_p2_x * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = +- make_float4(sc3.x * f_p2_y * (r[3] - r[6] * p1_p2), +- sc3.y * f_p2_y * (r[4] - r[7] * p1_p2), +- sc3.z * f_p2_y * (r[5] - r[8] * p1_p2), 0); +- } +- } else if (md) { +- if (scaling == false) { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2); +- jc[jc_pos + 1] = +- make_float4(-f_p2 * p0_p2 * y0, f_p2 * (z0 + x0 * p0_p2), +- -f_p2 * y0, -ms.x * msn); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2); +- jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2), +- f_p2 * x0 * p1_p2, f_p2 * x0, -ms.y * msn); +- JFRT_SET_JC_END +- } +- //////////////////// +- jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * 
p0_p2), +- f_p2 * (r[1] - r[7] * p0_p2), +- f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2), +- f_p2 * (r[4] - r[7] * p1_p2), +- f_p2 * (r[5] - r[8] * p1_p2), 0); +- } else { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x); +- jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0, +- -f_p2 * p0_p2 * sc1.w); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z, +- -f_p2 * p1_p2 * sc1.w); +- +- float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1); +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0, +- sc2.y * f_p2 * (z0 + x0 * p0_p2), +- -sc2.z * f_p2 * y0, -msn * ms.x * sc2.w); +- jc[jc_pos + 3] = make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2), +- sc2.y * f_p2 * x0 * p1_p2, +- sc2.z * f_p2 * x0, -msn * ms.y * sc2.w); +- JFRT_SET_JC_END +- } +- float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx); +- jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2), +- sc3.y * f_p2 * (r[1] - r[7] * p0_p2), +- sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = +- make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2), +- sc3.y * f_p2 * (r[4] - r[7] * p1_p2), +- sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0); +- } +- +- } else { +- if (scaling == false) { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2); +- jc[jc_pos + 1] = make_float4(-f_p2 * p0_p2 * y0, +- f_p2 * (z0 + x0 * p0_p2), -f_p2 * y0, 0); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2); +- jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2), +- f_p2 * x0 * p1_p2, f_p2 * x0, 0); +- JFRT_SET_JC_END +- } +- //////////////////// +- jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * p0_p2), +- f_p2 * (r[1] - r[7] * p0_p2), +- f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2), +- f_p2 * (r[4] - r[7] * p1_p2), +- f_p2 * (r[5] - r[8] * p1_p2), 0); +- } else { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x); +- jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0, +- -f_p2 * p0_p2 * sc1.w); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z, +- -f_p2 * p1_p2 * sc1.w); +- float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1); +- jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0, +- sc2.y * f_p2 * (z0 + x0 * p0_p2), +- -sc2.z * f_p2 * y0, 0); +- jc[jc_pos + 3] = +- make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2), +- sc2.y * f_p2 * x0 * p1_p2, sc2.z * f_p2 * x0, 0); +- JFRT_SET_JC_END +- } +- +- float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx); +- jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2), +- sc3.y * f_p2 * (r[1] - r[7] * p0_p2), +- sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = +- make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2), +- sc3.y * f_p2 * (r[4] - r[7] * p1_p2), +- sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0); +- } +- } +-} +- +-///////////////////////////////// +-void ProgramCU::ComputeJacobian(CuTexImage& camera, CuTexImage& point, +- CuTexImage& jc, CuTexImage& jp, +- CuTexImage& proj_map, CuTexImage& sj, +- CuTexImage& meas, CuTexImage& cmlist, +- bool intrinsic_fixed, int radial_distortion, +- bool shuffle) { +- float jfc = intrinsic_fixed ? 
0.0f : 1.0f; +- unsigned int len = proj_map.GetImgWidth(); +- unsigned int bsize = JACOBIAN_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- proj_map.BindTexture(tex_jacobian_idx); +- +- if (!jc.IsValid()) shuffle = false; +- if (shuffle) cmlist.BindTexture(tex_jacobian_shuffle); +- if (sj.IsValid()) sj.BindTexture(tex_jacobian_sj); +- +- if (radial_distortion == -1) { +- meas.BindTexture(tex_jacobian_meas); +- if (sj.IsValid()) { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } else { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } +- } else if (radial_distortion) { +- if (sj.IsValid()) { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } else { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } +- } else { +- if (sj.IsValid()) { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } else { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } +- } +- +- ProgramCU::CheckErrorCUDA("ComputeJacobian"); +-} +- +-texture tex_compact_cam; +-__global__ void uncompress_frt_kernel(int ncam, float4* ucam) { +- int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (tidx >= ncam) return; +- int fetch_index = tidx << 1; +- int write_index = IMUL(tidx, 4); +- float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index); +- ucam[write_index] = temp1; +- +- float4 temp2 = tex1Dfetch(tex_compact_cam, fetch_index + 1); +- float rx = temp2.x; +- float ry = temp2.y; +- float rz = temp2.z; +- float rx_rx = rx * rx; +- float ry_ry = ry * ry; +- float rz_rz = rz * rz; +- float aa = sqrt(rx_rx + ry_ry + rz_rz); +- float caa, saa; +- sincosf(aa, &saa, &caa); +- float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa); +- float st = aa == 0.0 ? 
1 : FDIV2(saa, aa); +- float rz_st = rz * st; +- float rx_st = rx * st; +- float ry_st = ry * st; +- float ry_ry_ct = ry_ry * ct; +- float rx_rx_ct = rx_rx * ct; +- float rz_rz_ct = rz_rz * ct; +- float rx_ry_ct = rx * ry * ct; +- float rz_rx_ct = rz * rx * ct; +- float ry_rz_ct = ry * rz * ct; +- +- //////////////////////////////////////////////////////////// +- ucam[write_index + 1] = +- make_float4((1.0 - (ry_ry_ct + rz_rz_ct)), (rx_ry_ct - rz_st), +- (rz_rx_ct + ry_st), (rx_ry_ct + rz_st)); +- +- ucam[write_index + 2] = +- make_float4((1.0 - (rz_rz_ct + rx_rx_ct)), (ry_rz_ct - rx_st), +- (rz_rx_ct - ry_st), (ry_rz_ct + rx_st)); +- +- ucam[write_index + 3] = +- make_float4((1.0 - (rx_rx_ct + ry_ry_ct)), temp2.w, 0, 0); +-} +- +-void ProgramCU::UncompressCamera(int ncam, CuTexImage& camera, +- CuTexImage& result) { +- unsigned int len = ncam; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- dim3 grid(nblock); +- dim3 block(bsize); +- camera.BindTexture(tex_compact_cam); +- uncompress_frt_kernel<<>>(len, (float4*)result.data()); +- CheckErrorCUDA("UncompressCamera"); +-} +- +-texture tex_uncompressed_cam; +- +-__global__ void compress_frt_kernel(int ncam, float4* zcam) { +- int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (tidx >= ncam) return; +- int fetch_index = tidx << 2; +- int write_index = tidx << 1; +- float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index); +- zcam[write_index] = temp1; +- +- float4 r1 = tex1Dfetch(tex_compact_cam, fetch_index + 1); +- float4 r2 = tex1Dfetch(tex_compact_cam, fetch_index + 2); +- float4 r3 = tex1Dfetch(tex_compact_cam, fetch_index + 3); +- +- float a = (r1.x + r2.x + r3.x - 1.0) / 2.0; +- if (a >= 1.0) { +- zcam[write_index + 1] = make_float4(0, 0, 0, 0); +- } else { +- float aa = acos(a), b = 0.5 * aa * rsqrt(1 - a * a); +- zcam[write_index + 1] = make_float4(b * (r2.w - r2.y), b * (r1.z - r2.z), +- b * (r1.w - r1.y), r3.y); +- } +-} +- +-void ProgramCU::CompressCamera(int ncam, CuTexImage& camera0, +- CuTexImage& result) { +- unsigned int len = ncam; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- dim3 grid(nblock), block(bsize); +- camera0.BindTexture(tex_uncompressed_cam); +- compress_frt_kernel<<>>(ncam, (float4*)result.data()); +- CheckErrorCUDA("CompressCamera"); +-} +- +-__device__ inline void uncompress_rodrigues_rotation(float rx, float ry, +- float rz, float* r) { +- float rx_rx = rx * rx; +- float ry_ry = ry * ry; +- float rz_rz = rz * rz; +- float aa = sqrt(rx_rx + ry_ry + rz_rz); +- float caa, saa; +- sincosf(aa, &saa, &caa); +- float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa); +- float st = aa == 0.0 ? 
1 : FDIV2(saa, aa); +- float rz_st = rz * st; +- float rx_st = rx * st; +- float ry_st = ry * st; +- float ry_ry_ct = ry_ry * ct; +- float rx_rx_ct = rx_rx * ct; +- float rz_rz_ct = rz_rz * ct; +- float rx_ry_ct = rx * ry * ct; +- float rz_rx_ct = rz * rx * ct; +- float ry_rz_ct = ry * rz * ct; +- r[0] = (1.0 - (ry_ry_ct + rz_rz_ct)); +- r[1] = (rx_ry_ct - rz_st); +- r[2] = (rz_rx_ct + ry_st); +- r[3] = (rx_ry_ct + rz_st); +- r[4] = (1.0 - (rz_rz_ct + rx_rx_ct)); +- r[5] = (ry_rz_ct - rx_st); +- r[6] = (rz_rx_ct - ry_st); +- r[7] = (ry_rz_ct + rx_st); +- r[8] = (1.0 - (rx_rx_ct + ry_ry_ct)); +-} +- +-texture tex_update_cam; +-texture tex_update_cam_delta; +- +-__global__ void update_camera_kernel(int ncam, float4* newcam) { +- int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (tidx >= ncam) return; +- int index0 = tidx << 2; +- int index1 = tidx << 1; +- { +- float4 c1 = tex1Dfetch(tex_update_cam, index0); +- float4 d1 = tex1Dfetch(tex_update_cam_delta, index1); +- float4 c2 = make_float4(max(c1.x + d1.x, 1e-10f), c1.y + d1.y, c1.z + d1.z, +- c1.w + d1.w); +- newcam[index0] = c2; +- } +- { +- float r[9], dr[9]; //, nr[9]; +- float4 r1 = tex1Dfetch(tex_update_cam, index0 + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_update_cam, index0 + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_update_cam, index0 + 3); +- r[8] = r3.x; +- +- float4 dd = tex1Dfetch(tex_update_cam_delta, index1 + 1); +- uncompress_rodrigues_rotation(dd.x, dd.y, dd.z, dr); +- +- /////////////////////////////////////////////// +- newcam[index0 + 1] = +- make_float4(dr[0] * r[0] + dr[1] * r[3] + dr[2] * r[6], +- dr[0] * r[1] + dr[1] * r[4] + dr[2] * r[7], +- dr[0] * r[2] + dr[1] * r[5] + dr[2] * r[8], +- dr[3] * r[0] + dr[4] * r[3] + dr[5] * r[6]); +- newcam[index0 + 2] = +- make_float4(dr[3] * r[1] + dr[4] * r[4] + dr[5] * r[7], +- dr[3] * r[2] + dr[4] * r[5] + dr[5] * r[8], +- dr[6] * r[0] + dr[7] * r[3] + dr[8] * r[6], +- dr[6] * r[1] + dr[7] * r[4] + dr[8] * r[7]); +- newcam[index0 + 3] = make_float4(dr[6] * r[2] + dr[7] * r[5] + dr[8] * r[8], +- r3.y + dd.w, r3.z, r3.w); +- } +-} +- +-void ProgramCU::UpdateCameraPoint(int ncam, CuTexImage& camera, +- CuTexImage& point, CuTexImage& delta, +- CuTexImage& new_camera, CuTexImage& new_point, +- int mode) { +- if (mode != 2) { +- unsigned int len = ncam; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- dim3 grid(nblock), block(bsize); +- camera.BindTexture(tex_update_cam); +- delta.BindTexture(tex_update_cam_delta); +- update_camera_kernel<<>>(len, (float4*)new_camera.data()); +- CheckErrorCUDA("UpdateCamera"); +- } +- +- // update the points +- if (mode != 1) { +- CuTexImage dp; +- dp.SetTexture(delta.data() + 8 * ncam, point.GetLength()); +- ComputeSAXPY(1.0f, dp, point, new_point); +- CheckErrorCUDA("UpdatePoint"); +- } +-} +- +-#define PROJECTION_FRT_KWIDTH 64 +- +-texture tex_projection_cam; +-texture tex_projection_idx; +-texture tex_projection_pts; +-texture tex_projection_mea; +- +-// run 32/64/128 projections in a block +-template +-__global__ void projection_frt_kernel(int nproj, int rowsz, float2* pj) { +- //////////////////////////////// +- int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (tidx >= nproj) return; +- float f, m[3], t[3]; // r[9], +- __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9]; +- float* r = rr_data + IMUL(9, threadIdx.x); +- int2 proj = tex1Dfetch(tex_projection_idx, 
tidx); +- int cpos = proj.x << 1; +- float4 ft = tex1Dfetch(tex_projection_cam, cpos); +- f = ft.x; +- t[0] = ft.y; +- t[1] = ft.z; +- t[2] = ft.w; +- float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_projection_pts, proj.y); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0]; +- float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1]; +- float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2]; +- +- if (pd) { +- float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2); +- float f_p2 = FDIV2(f * rr, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2); +- } else if (md) { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y); +- pj[tidx] = make_float2(ms.x * rd - p0 * f_p2, ms.y * rd - p1 * f_p2); +- } else { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2); +- } +-} +- +-void ProgramCU::ComputeProjection(CuTexImage& camera, CuTexImage& point, +- CuTexImage& meas, CuTexImage& proj_map, +- CuTexImage& proj, int radial) { +- unsigned int len = proj_map.GetImgWidth(); +- unsigned int bsize = PROJECTION_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- camera.BindTexture(tex_projection_cam); +- point.BindTexture(tex_projection_pts); +- proj_map.BindTexture(tex_projection_idx); +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- meas.BindTexture(tex_projection_mea); +- if (radial == -1) +- projection_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else if (radial) +- projection_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else +- projection_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- CheckErrorCUDA("ComputeProjection"); +-} +- +-template +-__global__ void projectionx_frt_kernel(int nproj, int rowsz, float2* pj) { +- //////////////////////////////// +- int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (tidx >= nproj) return; +- float f, m[3], t[3]; // r[9], +- __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9]; +- float* r = rr_data + IMUL(9, threadIdx.x); +- int2 proj = tex1Dfetch(tex_projection_idx, tidx); +- int cpos = proj.x << 1; +- float4 ft = tex1Dfetch(tex_projection_cam, cpos); +- f = ft.x; +- t[0] = ft.y; +- t[1] = ft.z; +- t[2] = ft.w; +- float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_projection_pts, proj.y); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0]; +- float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1]; +- float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2]; +- if (pd) { +- float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2); +- float f_p2 = FDIV2(f, 
p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x / rr - p0 * f_p2, ms.y / rr - p1 * f_p2); +- } else if (md) { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y); +- pj[tidx] = make_float2(ms.x - p0 * f_p2 / rd, ms.y - p1 * f_p2 / rd); +- } else { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2); +- } +-} +- +-void ProgramCU::ComputeProjectionX(CuTexImage& camera, CuTexImage& point, +- CuTexImage& meas, CuTexImage& proj_map, +- CuTexImage& proj, int radial) { +- unsigned int len = proj_map.GetImgWidth(); +- unsigned int bsize = PROJECTION_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- camera.BindTexture(tex_projection_cam); +- point.BindTexture(tex_projection_pts); +- proj_map.BindTexture(tex_projection_idx); +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- meas.BindTexture(tex_projection_mea); +- if (radial == -1) +- projectionx_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else if (radial) +- projectionx_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else +- projectionx_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- CheckErrorCUDA("ComputeProjection"); +-} +- +-texture tex_jte_pe; +-texture tex_jte_pex; +-texture tex_jte_jc; +-texture tex_jte_jc2; +-texture tex_jte_cmp; +-texture tex_jte_cmt; +-texture tex_jte_jc3; +-texture tex_jte_jc4; +- +-__global__ void jte_cam_kernel(int num, float* jc, float* jte) { +- __shared__ float value[128]; +- +- // 8thread per camera +- int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (col >= num) return; +- +- int cam = col >> 4; // 8 thread per camera +- +- // read data range for this camera, 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1 +- +- /////////////////////////////// +- int offset = threadIdx.x & 0xf; // which parameter of this camera +- int part = offset >= 8 ? 1 : 0; +- ///////////////////////////// +- +- float result = 0; +- // loop to read the index of the projection. +- // so to get the location to read the jacobian +- for (int i = idx1 + offset; i < idx2; i += 16) { +- float temp = jc[i]; +- // every 8 thread will read the same position. +- int index = tex1Dfetch(tex_jte_cmt, i >> 4); +- float v = tex1Dfetch(tex_jte_pex, (index << 1) + part); +- ////////////////////// +- result += temp * v; +- } +- value[threadIdx.x] = result; +- // write back +- if (offset < 8) jte[(cam << 3) + offset] = (result + value[threadIdx.x + 8]); +-} +- +-template +-__global__ void jte_cam_vec_kernel(int num, float* jte) { +- __shared__ float value[KH * 128]; +- int cam = blockIdx.x * KH + threadIdx.y; +- if (cam >= num) return; +- +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 2; // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 2; // last camera + 1 +- int part = (threadIdx.x & 0x02) ? 1 : 0; +- +- float rx = 0, ry = 0, rz = 0, rw = 0; +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- float4 temp; +- if (TEXN == 1) { +- temp = tex1Dfetch(tex_jte_jc, i); +- } +- if (TEXN == 2) { +- int texid = i >> 25; +- if (texid == 0) +- temp = tex1Dfetch(tex_jte_jc, i); +- else +- temp = tex1Dfetch(tex_jte_jc2, (i & 0x1ffffff)); +- } +- if (TEXN == 4) { +- int index = tex1Dfetch(tex_jte_cmt, i >> 2); +- int iii = (index << 2) + (i & 0x3); +- int texid = iii >> 25; +- ///////////////////////////////// +- if (texid == 0) +- temp = tex1Dfetch(tex_jte_jc, iii); +- else if (texid == 1) +- temp = tex1Dfetch(tex_jte_jc2, (iii & 0x1ffffff)); +- else if (texid == 2) +- temp = tex1Dfetch(tex_jte_jc3, (iii & 0x1ffffff)); +- else +- temp = tex1Dfetch(tex_jte_jc4, (iii & 0x1ffffff)); +- } +- int index = tex1Dfetch(tex_jte_cmt, i >> 2); +- float vv = tex1Dfetch(tex_jte_pex, (index << 1) + part); +- rx += temp.x * vv; +- ry += temp.y * vv; +- rz += temp.z * vv; +- rw += temp.w * vv; +- } +- //////////////////////////////////// +- int widx = (threadIdx.y << 7) + (threadIdx.x << 2); +- /////////////////////////////////// +- // write back +- value[widx] = rx; +- value[widx + 1] = ry; +- value[widx + 2] = rz; +- value[widx + 3] = rw; +- //////////////////////////////////// +- int ridx = (threadIdx.y << 7) + threadIdx.x; +- value[ridx] = ((value[ridx] + value[ridx + 32]) + +- (value[ridx + 64] + value[ridx + 96])); +- if (threadIdx.x < 16) value[ridx] += value[ridx + 16]; +- if (threadIdx.x < 8) +- jte[(cam << 3) + threadIdx.x] = value[ridx] + value[ridx + 8]; +-} +- +-template +-__global__ void jte_cam_vec32_kernel(int num, float* jc, float* jte) { +- __shared__ float value[KH * 32]; +- int cam = blockIdx.x * KH + threadIdx.y; +- if (cam >= num) return; +- float sum = 0; +- int rowpos = (threadIdx.y << 5); +- int index = threadIdx.x + rowpos; +- int xypart = (threadIdx.x & 0x08) ? 1 : 0; +- int part2 = threadIdx.x & 0xf; +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1 +- +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- int index = tex1Dfetch(tex_jte_cmt, i >> 4); +- float temp; +- if (JT) +- temp = jc[i]; +- else +- temp = jc[(index << 4) + part2]; +- +- float v = tex1Dfetch(tex_jte_pex, (index << 1) + xypart); +- sum += temp * v; +- } +- value[index] = sum; +- +- if (threadIdx.x < 16) value[index] += value[index + 16]; +- if (threadIdx.x < 8) +- jte[(cam << 3) + threadIdx.x] = value[index] + value[index + 8]; +-} +- +-///////////////////////////////////////////////////////////// +-texture tex_jte_jp; +-texture tex_jte_pmp; +-texture tex_jte_jp2; +- +-__global__ void jte_point_kernel(int num, float4* jte) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jte_pmp, index); // first camera +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last camera + 1 +- float4 result = make_float4(0, 0, 0, 0); +- for (int i = idx1; i < idx2; ++i) { +- // error vector +- float2 ev = tex1Dfetch(tex_jte_pe, i); +- +- float4 j1 = tex1Dfetch(tex_jte_jp, i << 1); +- result.x += j1.x * ev.x; +- result.y += j1.y * ev.x; +- result.z += j1.z * ev.x; +- +- float4 j2 = tex1Dfetch(tex_jte_jp, 1 + (i << 1)); +- result.x += j2.x * ev.y; +- result.y += j2.y * ev.y; +- result.z += j2.z * ev.y; +- } +- jte[index] = result; +-} +- +-//////////////////// +-// faster but not always more accurate +-//#define JTE_POINT_VEC2 +- +-template +-__global__ void jte_point_vec_kernel(int num, int rowsz, float* jte) { +- //////////////////////////// +- __shared__ float value[KH * 128]; +- int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz; +- if (index >= num) return; +-#ifdef JTE_POINT_VEC2 +- int idx1 = tex1Dfetch(tex_jte_pmp, index); // first +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1 +-#else +- int idx1 = tex1Dfetch(tex_jte_pmp, index) << 1; // first +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1) << 1; // last + 1 +-#endif +- float rx = 0, ry = 0, rz = 0; +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- if (TEXN == 2 && i >> 25) { +-#ifdef JTE_POINT_VEC2 +- +- float2 vv = tex1Dfetch(tex_jte_pe, i); +- float4 jp1 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1)); +- float4 jp2 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1) + 1); +- rx += (jp1.x * vv.x + jp2.x * vv.y); +- ry += (jp1.y * vv.x + jp2.y * vv.y); +- rz += (jp1.z * vv.x + jp2.z * vv.y); +-#else +- float vv = tex1Dfetch(tex_jte_pex, i); +- float4 jpi = tex1Dfetch(tex_jte_jp2, i & 0x1ffffff); +- rx += jpi.x * vv; +- ry += jpi.y * vv; +- rz += jpi.z * vv; +-#endif +- } else { +-#ifdef JTE_POINT_VEC2 +- float2 vv = tex1Dfetch(tex_jte_pe, i); +- float4 jp1 = tex1Dfetch(tex_jte_jp, (i << 1)); +- float4 jp2 = tex1Dfetch(tex_jte_jp, (i << 1) + 1); +- rx += (jp1.x * vv.x + jp2.x * vv.y); +- ry += (jp1.y * vv.x + jp2.y * vv.y); +- rz += (jp1.z * vv.x + jp2.z * vv.y); +-#else +- float vv = tex1Dfetch(tex_jte_pex, i); +- float4 jpi = tex1Dfetch(tex_jte_jp, i); +- rx += jpi.x * vv; +- ry += jpi.y * vv; +- rz += jpi.z * vv; +-#endif +- } +- } +- +- int rowp = threadIdx.y << 7; +- int loc = (threadIdx.x << 2) + rowp; +- value[loc] = rx; +- value[loc + 1] = ry; +- value[loc + 2] = rz; +- value[loc + 3] = 0; +- +- int ridx = threadIdx.x + rowp; +- value[ridx] = ((value[ridx] + value[ridx + 32]) + +- (value[ridx + 64] + value[ridx + 96])); +- if (threadIdx.x < 16) value[ridx] += value[ridx + 16]; +- if (threadIdx.x < 8) value[ridx] += value[ridx + 8]; +- 
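The vectorized jte_point_vec_kernel around this point is a throughput-oriented variant of jte_point_kernel just above: for every point, accumulate Jp^T * e over the projections listed in the point map. A minimal sketch of that accumulation with one thread per point and plain global-memory loads in place of the texture fetches follows; the kernel name jtE_point_simple and its parameters are illustrative, not the PBA API.

#include <cuda_runtime.h>

// For point k, projections pmap[k] .. pmap[k+1]-1 contribute Jp^T * e to its 3-vector.
// jp stores two float4 rows per projection: d(ex)/dX and d(ey)/dX (w unused).
__global__ void jtE_point_simple(const float4* jp, const float2* err,
                                 const int* pmap, int npoint, float4* jte) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= npoint) return;
  float3 acc = make_float3(0.0f, 0.0f, 0.0f);
  for (int i = pmap[k]; i < pmap[k + 1]; ++i) {
    float2 e = err[i];
    float4 jx = jp[2 * i];      // row for the x-residual
    float4 jy = jp[2 * i + 1];  // row for the y-residual
    acc.x += jx.x * e.x + jy.x * e.y;
    acc.y += jx.y * e.x + jy.y * e.y;
    acc.z += jx.z * e.x + jy.z * e.y;
  }
  jte[k] = make_float4(acc.x, acc.y, acc.z, 0.0f);
}

The deleted kernel instead assigns 32 threads per point so that consecutive threads read consecutive jp entries, then folds the per-thread partials through the strided shared-memory steps seen here.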
if (threadIdx.x < 4) +- jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4]; +-} +- +-#define JTE_CAMERA_VEC +-#define JTE_POINT_VEC +- +-void ProgramCU::ComputeJtE(CuTexImage& pe, CuTexImage& jc, CuTexImage& cmap, +- CuTexImage& cmlist, CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& jte, bool jc_transpose, int mode) { +- ////////////////////////////////////////////////////////// +- int ncam = int(cmap.GetImgWidth() - 1); // how many cameras +- size_t szjc = jc.GetDataSize(); +- +- ////////////////////////////// +- cmap.BindTexture(tex_jte_cmp); +- cmlist.BindTexture(tex_jte_cmt); +-#ifdef JTE_CAMERA_VEC2 +- pe.BindTexture(tex_jte_pex); +- const unsigned int bheight = 2; +- dim3 block1(32, bheight), grid1((ncam + bheight - 1) / bheight); +- if (mode == 2) { +- } else if (jc_transpose) +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- else +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- +-#elif defined(JTE_CAMERA_VEC) +- pe.BindTexture(tex_jte_pex); +- const unsigned int bheight = 2; +- unsigned int len1 = ncam * 32; +- unsigned int bsize1 = 32 * bheight; +- unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1; +- dim3 grid1(nblock1); +- dim3 block1(32, bheight); +- if (mode == 2) { +- // skip camera +- } else if (szjc > 2 * MAX_TEXSIZE || !jc_transpose) { +- if (jc_transpose) +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- else +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- } else if (szjc > MAX_TEXSIZE) { +- jc.BindTexture2(tex_jte_jc, tex_jte_jc2); +- jte_cam_vec_kernel<<>>(ncam, jte.data()); +- } else { +- jc.BindTexture(tex_jte_jc); +- jte_cam_vec_kernel<<>>(ncam, jte.data()); +- } +-#else +- pe.BindTexture(tex_jte_pex); +- unsigned int len1 = ncam * 16; +- unsigned int bsize1 = len1 > 32 * 128 ? 128 : (len1 > 32 * 64 ? 
64 : 32); +- unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1; +- dim3 grid1(nblock1), block1(bsize1); +- jte_cam_kernel<<>>(len1, jc.data(), jte.data()); +-#endif +- CheckErrorCUDA("ComputeJtE"); +- +- //////////////////////////////////////////// +- pmap.BindTexture(tex_jte_pmp); +- unsigned int npoint = (pmap.GetImgWidth() - 1); +-#ifndef JTE_POINT_VEC +- size_t len2 = npoint; +- unsigned int bsize2 = 64; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- dim3 grid2(nblock2), block2(bsize2); +- pe.BindTexture(tex_jte_pe); +- jp.BindTexture(tex_jte_jp); +- jte_point_kernel<<>>(len2, ((float4*)jte.data()) + 2 * ncam); +-#else +- +-#ifdef JTE_POINT_VEC2 +- pe.BindTexture(tex_jte_pe); +-#else +- pe.BindTexture(tex_jte_pex); +-#endif +- const unsigned int bheight2 = 2; +- unsigned int bsize2 = 32; +- unsigned int nblock2 = (unsigned int)((npoint + bheight2 - 1) / bheight2); +- unsigned int offsetv = 8 * ncam; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2, bheight2); +- if (mode == 1) { +- // skip point +- } else if (jp.GetDataSize() > MAX_TEXSIZE) { +- jp.BindTexture2(tex_jte_jp, tex_jte_jp2); +- jte_point_vec_kernel<<>>( +- npoint, bw * bheight2, ((float*)jte.data()) + offsetv); +- } else { +- jp.BindTexture(tex_jte_jp); +- jte_point_vec_kernel<<>>( +- npoint, bw * bheight2, ((float*)jte.data()) + offsetv); +- } +-#endif +- CheckErrorCUDA("ComputeJtE"); +-} +- +-texture tex_jtjd_cmp; +-texture tex_jtjd_cmlist; +- +-template +-__global__ void jtjd_cam_vec32_kernel(int num, int add_existing_dq, float* jc, +- float* jtjd, float* jtjdi) { +- __shared__ float value[KH * 32]; +- +- // 8thread per camera +- int cam = blockIdx.x * KH + threadIdx.y; +- int part = threadIdx.x & 0x7; // which parameter of this camera +- int part2 = threadIdx.x & 0xf; +- int campos = threadIdx.y << 5; +- int index = threadIdx.x + campos; +- float sum = 0; +- if (cam < num && part < VN) { +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1 +- +- // loop to read the index of the projection. +- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- if (JT) { +- float temp = jc[i]; +- sum += temp * temp; +- } else { +- int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4; +- float temp = jc[ii + part2]; +- sum += temp * temp; +- } +- } +- } +- __syncthreads(); +- +- if (cam >= num) return; +- // save all the results? +- value[index] = sum; +- if (threadIdx.x < 16) value[index] += value[index + 16]; +- if (threadIdx.x < 8) +- +- // write back +- if (threadIdx.x < 8) { +- float temp = value[index] + value[index + 8]; +- int wpos = threadIdx.x + (cam << 3); +- if (add_existing_dq) temp += jtjd[wpos]; +- jtjd[wpos] = temp; +- jtjdi[wpos] = temp == 0 ? 
0 : 1 / (temp); +- } +-} +- +-texture tex_jtjd_jp; +-texture tex_jtjd_pmp; +-texture tex_jtjd_jp2; +- +-#define JTJD_POINT_KWIDTH 64 +- +-template +-__global__ void jtjd_point_kernel(int num, int rowsz, float4* jtjd, +- float4* jtjdi) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera +- int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1 +- float rx = 0, ry = 0, rz = 0; +- for (int i = idx1; i < idx2; ++i) { +- if (TEXN == 2 && i > 0xffffff) { +- float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1); +- rx += j1.x * j1.x; +- ry += j1.y * j1.y; +- rz += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1)); +- rx += j2.x * j2.x; +- ry += j2.y * j2.y; +- rz += j2.z * j2.z; +- } else { +- float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1); +- rx += j1.x * j1.x; +- ry += j1.y * j1.y; +- rz += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1)); +- rx += j2.x * j2.x; +- ry += j2.y * j2.y; +- rz += j2.z * j2.z; +- } +- } +- +- if (jtjd) jtjd[index] = make_float4(rx, ry, rz, 0.0f); +- jtjdi[index] = make_float4(1.0f / rx, 1.0f / ry, 1.0f / rz, 0.0f); +-} +- +-void ProgramCU::ComputeDiagonal(CuTexImage& jc, CuTexImage& cmap, +- CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& cmlist, CuTexImage& jtjd, +- CuTexImage& jtjdi, bool jc_transpose, +- int radial, bool add_existing_diagc) { +- ////////////////////////////////////////////////////////// +- size_t szjc = jc.GetDataSize(); +- unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras +- +- const unsigned int bheight = 2; +- dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight); +- cmap.BindTexture(tex_jtjd_cmp); +- if (jc_transpose) { +- if (radial) +- jtjd_cam_vec32_kernel<8, bheight, true><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- else +- jtjd_cam_vec32_kernel<7, bheight, true><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- } else { +- cmlist.BindTexture(tex_jtjd_cmlist); +- if (radial) +- jtjd_cam_vec32_kernel<8, bheight, false><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- else +- jtjd_cam_vec32_kernel<7, bheight, false><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- } +- CheckErrorCUDA("ComputeDiagonal"); +- +- //////////////////////////////////////////// +- unsigned int npoint = (pmap.GetImgWidth() - 1); +- unsigned int len2 = npoint; +- unsigned int bsize2 = JTJD_POINT_KWIDTH; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- pmap.BindTexture(tex_jtjd_pmp); +- +- if (jp.GetDataSize() > MAX_TEXSIZE) { +- jp.BindTexture2(tex_jtjd_jp, tex_jtjd_jp2); +- jtjd_point_kernel<2><<>>(len2, (bw * bsize2), +- ((float4*)jtjd.data()) + 2 * ncam, +- ((float4*)jtjdi.data()) + 2 * ncam); +- } else { +- jp.BindTexture(tex_jtjd_jp); +- jtjd_point_kernel<1><<>>(len2, (bw * bsize2), +- ((float4*)jtjd.data()) + 2 * ncam, +- ((float4*)jtjdi.data()) + 2 * ncam); +- } +- CheckErrorCUDA("ComputeDiagonal"); +-} +- +-// for each +-template +-__global__ void jtjd_cam_q_kernel(int num, int rowsz, float* qw, float4* diag) { +- int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y; +- int index = bindex + threadIdx.x; +- if (index >= num) return; +- int tid = index & 0x1; +- float w = qw[index], ws 
= w * w * 2.0f; +- if (SJ) { +- float4 sj = tex1Dfetch(tex_jacobian_sj, index); +- float4 dj = tid == 0 ? make_float4(sj.x * sj.x * ws, 0, 0, 0) +- : make_float4(0, 0, 0, sj.w * sj.w * ws); +- diag[index] = dj; +- } else { +- float4 dj = tid == 0 ? make_float4(ws, 0, 0, 0) : make_float4(0, 0, 0, ws); +- diag[index] = dj; +- } +-} +- +-void ProgramCU::ComputeDiagonalQ(CuTexImage& qlistw, CuTexImage& sj, +- CuTexImage& diag) { +- unsigned int bsize = 32; +- unsigned int len = qlistw.GetImgWidth() * 2; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- jtjd_cam_q_kernel<<>>(len, (bw * bsize), qlistw.data(), +- (float4*)diag.data()); +- } else { +- jtjd_cam_q_kernel<<>>(len, (bw * bsize), qlistw.data(), +- (float4*)diag.data()); +- } +- CheckErrorCUDA("ComputeDiagonalQ"); +-} +- +-template +-__global__ void jtjd_cam_block_vec32_kernel(int num, float lambda1, +- float lambda2, float* jc, +- float* diag, float* blocks, +- bool add_existing_diagc) { +- __shared__ float value[KH * 32 * VN]; +- +- // 8thread per camera +- int cam = blockIdx.x * KH + threadIdx.y; +- int part = threadIdx.x & 0x7; // which parameter of this camera +- int part2 = threadIdx.x & 0xf; +- int index = threadIdx.x + (threadIdx.y << 5); +- float row[8] = {0, 0, 0, 0, 0, 0, 0, 0}; +- if (cam < num) { +- int rowpos = index - part; +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1 +- +- // loop to read the index of the projection. +- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- if (JT) { +- float temp = jc[i]; +- value[index] = temp; +- for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]); +- } else { +- int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4; +- float temp = jc[ii + part2]; +- value[index] = temp; +- for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]); +- } +- } +- } +- __syncthreads(); +- +- if (cam >= num) return; +- // save all the results? +- for (int i = 0; i < VN; ++i) value[index * VN + i] = row[i]; +- int campos = threadIdx.y * (32 * VN); +- for (int i = threadIdx.x; i < (VN * 16); i += 32) +- value[campos + i] += value[campos + i + (16 * VN)]; +- for (int i = threadIdx.x; i < (VN * 8); i += 32) +- value[campos + i] += value[campos + i + (8 * VN)]; +- +- if (VN == 7) { +- bool zero = (part >= VN); +- +- // write back +- if (threadIdx.x < 8) { +- float* dp = value + campos + threadIdx.x * (VN + 1); +- float temp = zero ? 0 : dp[0]; +- int didx = threadIdx.x + (cam << 3); +- if (add_existing_diagc) temp += diag[didx]; +- diag[didx] = temp; +- dp[0] = lambda1 + lambda2 * temp; +- } +- int wpos = cam * (8 * VN) + threadIdx.x; +- int rpos = campos + threadIdx.x - (threadIdx.x >> 3); +- blocks[wpos] = zero ? 0 : value[rpos]; +- if (threadIdx.x < (VN * 8 - 32)) +- blocks[wpos + 32] = zero ? 
0 : value[rpos + 28]; +- } else { +- // write back +- if (threadIdx.x < 8) { +- float* dp = value + campos + threadIdx.x * (VN + 1); +- float temp = dp[0]; +- int didx = threadIdx.x + (cam << 3); +- if (add_existing_diagc) temp += diag[didx]; +- diag[didx] = temp; +- dp[0] = lambda1 + lambda2 * temp; // max(, 1e-6) ; +- } +- int wpos = cam * (8 * VN) + threadIdx.x; +- int rpos = campos + threadIdx.x; +- blocks[wpos] = value[rpos]; +- blocks[wpos + 32] = value[rpos + 32]; +- } +-} +- +-#define JTJD_POINT_BLOCK_KWIDTH 64 +- +-template +-__global__ void jtjd_point_block_kernel(int num, int rowsz, float lambda1, +- float lambda2, float4* diag, +- float4* blocks) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera +- int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1 +- +- float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0; +- for (int i = idx1; i < idx2; ++i) { +- if (TEXN == 2 && i > 0xffffff) { +- float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1); +- M00 += j1.x * j1.x; +- M01 += j1.x * j1.y; +- M02 += j1.x * j1.z; +- M11 += j1.y * j1.y; +- M12 += j1.y * j1.z; +- M22 += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1)); +- M00 += j2.x * j2.x; +- M01 += j2.x * j2.y; +- M02 += j2.x * j2.z; +- M11 += j2.y * j2.y; +- M12 += j2.y * j2.z; +- M22 += j2.z * j2.z; +- } else { +- float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1); +- M00 += j1.x * j1.x; +- M01 += j1.x * j1.y; +- M02 += j1.x * j1.z; +- M11 += j1.y * j1.y; +- M12 += j1.y * j1.z; +- M22 += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1)); +- M00 += j2.x * j2.x; +- M01 += j2.x * j2.y; +- M02 += j2.x * j2.z; +- M11 += j2.y * j2.y; +- M12 += j2.y * j2.z; +- M22 += j2.z * j2.z; +- } +- } +- +- diag[index] = make_float4(M00, M11, M22, 0); +- +- M00 = lambda2 * M00 + lambda1; +- M11 = lambda2 * M11 + lambda1; +- M22 = lambda2 * M22 + lambda1; +- +- // invert the 3x3 matrix. +- float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 1] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 2] = make_float4(0, 0, 0, 0); +- } else { +- float m00 = (M11 * M22 - M12 * M12) / det; +- float m01 = -(M01 * M22 - M12 * M02) / det; +- float m02 = (M01 * M12 - M02 * M11) / det; +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(m00, m01, m02, 0); +- +- float m11 = (M00 * M22 - M02 * M02) / det; +- float m12 = -(M00 * M12 - M01 * M02) / det; +- blocks[write_pos + 1] = make_float4(m01, m11, m12, 0); +- +- float m22 = (M00 * M11 - M01 * M01) / det; +- blocks[write_pos + 2] = make_float4(m02, m12, m22, 0); +- } +-} +- +-#define JTJD_BLOCK_CAM_INVERT_KWIDTH 64 +-template +-__global__ void jtjd_cam_block_invert_kernel(int num, float4* blocks) { +- // N / 8 cameras...each have 64 floats,,,, N * 8 float +- // each will read 8 float...... 
+- __shared__ float value[JTJD_BLOCK_CAM_INVERT_KWIDTH * VN]; +- __shared__ bool invalid[JTJD_BLOCK_CAM_INVERT_KWIDTH / 8]; +- ////////////////////////////////////////////// +- +- int bindex = IMUL(blockIdx.x, blockDim.x); +- int index = bindex + threadIdx.x; +- int block_read_pos = IMUL(bindex, VN); +- for (int i = 0; i < JTJD_BLOCK_CAM_INVERT_KWIDTH * VN; +- i += JTJD_BLOCK_CAM_INVERT_KWIDTH) +- value[threadIdx.x + i] = ((float*)blocks)[block_read_pos + threadIdx.x + i]; +- __syncthreads(); +- const int cam_id = threadIdx.x >> 3; +- const int cam_pos = IMUL(cam_id, VN * 8); +- const int col = threadIdx.x & 0x7, rowj_pos = col << 3; +- ; // +- +- float* a = value + cam_pos; +- for (int i = 0; i < VN; ++i) { +- int rowi_pos = i << 3, dpos = i + rowi_pos; +- if (col == i && a[dpos] > 0) a[dpos] = rsqrt(a[dpos]); +- __syncthreads(); +- float diag = a[dpos]; +- if (diag == 0 || col >= VN) continue; +- if (col < i) { +- a[rowi_pos + col] = 0; +- } else if (col > i) { +- float aij = a[rowi_pos + col] * diag; +- a[rowi_pos + col] = aij; +- for (int k = col; k < VN; ++k) a[rowj_pos + k] -= a[rowi_pos + k] * aij; +- } +- } +- +- if (index >= num) return; +- +- if (col == 0) invalid[cam_id] = false; +- if (col < VN) { +- for (int i = 1; i < VN; ++i) { +- int rowi_pos = i << 3, dpos = i + rowi_pos; +- if (a[dpos] == 0) continue; +- if (col < i) { +- float sum = 0; +- for (int k = col; k < i; ++k) +- sum += (a[(k << 3) + i] * a[rowj_pos + k]); +- a[rowj_pos + i] = -sum * a[dpos]; +- } +- } +- float ai[8], amax = 0; +- for (int i = 0; i < VN * 8; i += 8) { +- float sum = 0; +- for (int k = 0; k < VN; k++) sum += a[rowj_pos + k] * a[i + k]; +- ai[i >> 3] = sum; +- amax = max(amax, sum); +- } +- +- if (isinf(amax)) invalid[cam_id] = true; +- int write_pos = IMUL((index >> 3), (VN * 2)) + (col << 1); +- if (invalid[cam_id]) // a better way would be using a threshold +- { +- blocks[write_pos] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 1] = make_float4(0, 0, 0, 0); +- } else { +- blocks[write_pos] = make_float4(ai[0], ai[1], ai[2], ai[3]); +- blocks[write_pos + 1] = +- make_float4(ai[4], ai[5], ai[6], VN < 8 ? 0 : ai[7]); +- } +- } +-} +- +-void ProgramCU::ComputeDiagonalBlock(float lambda, bool dampd, CuTexImage& jc, +- CuTexImage& cmap, CuTexImage& jp, +- CuTexImage& pmap, CuTexImage& cmlist, +- CuTexImage& diag, CuTexImage& blocks, +- int radial_distortion, bool jc_transpose, +- bool add_existing_diagc, int mode) { +- size_t szjc = jc.GetDataSize(); +- unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? (1.0f + lambda) : 1.0f; +- const unsigned int bheight = 2; +- dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight); +- cmap.BindTexture(tex_jtjd_cmp); +- +- if (mode == 2) { +- // point only mode? 
+- } else if (radial_distortion) { +- if (jc_transpose) { +- jtjd_cam_block_vec32_kernel<8, bheight, true><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } else { +- cmlist.BindTexture(tex_jtjd_cmlist); +- jtjd_cam_block_vec32_kernel<8, bheight, false><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } +- } else { +- if (jc_transpose) { +- jtjd_cam_block_vec32_kernel<7, bheight, true><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } else { +- cmlist.BindTexture(tex_jtjd_cmlist); +- jtjd_cam_block_vec32_kernel<7, bheight, false><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } +- } +- CheckErrorCUDA("ComputeDiagonalBlock"); +- +- //////////////////////////////////////////// +- unsigned int npoint = (pmap.GetImgWidth() - 1); +- unsigned int len2 = npoint; +- unsigned int bsize2 = JTJD_POINT_BLOCK_KWIDTH; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int bw, bh; +- unsigned int offsetd = 2 * ncam; +- unsigned int offsetb = (radial_distortion ? 16 : 14) * ncam; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- pmap.BindTexture(tex_jtjd_pmp); +- if (mode == 1) { +- // camera only mode? +- } else if (jp.GetDataSize() > MAX_TEXSIZE) { +- jp.BindTexture2(tex_jtjd_jp, tex_jtjd_jp2); +- jtjd_point_block_kernel<2><<>>( +- len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb); +- } else { +- jp.BindTexture(tex_jtjd_jp); +- jtjd_point_block_kernel<1><<>>( +- len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb); +- } +- CheckErrorCUDA("ComputeDiagonalBlock"); +- +- if (mode != 2) { +- unsigned int len3 = ncam * 8; +- unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH; +- unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3; +- dim3 grid3(nblock3), block3(bsize3); +- if (radial_distortion) +- jtjd_cam_block_invert_kernel<8><<>>( +- len3, (float4*)blocks.data()); +- else +- jtjd_cam_block_invert_kernel<7><<>>( +- len3, (float4*)blocks.data()); +- CheckErrorCUDA("ComputeDiagonalBlockInverse"); +- } +-} +- +-template +-__global__ void multiply_block_conditioner_kernel(int num, int rowsz, +- float* blocks, float* x, +- float* result) { +- __shared__ float mat[WIDTH * VSZ]; +- __shared__ float val[WIDTH]; +- const int BSZ = 1 << BBIT; +- const int BMASK = BSZ - 1; +- int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y; +- int index = bindex + threadIdx.x; +- int block_read_pos = bindex * VSZ; +- val[threadIdx.x] = x[index]; +- for (int i = 0; i < VSZ * WIDTH; i += WIDTH) +- mat[i + threadIdx.x] = blocks[i + block_read_pos + threadIdx.x]; +- __syncthreads(); +- if (index >= num) return; +- float* ac = mat + (threadIdx.x >> BBIT) * (BSZ * VSZ) + (threadIdx.x & BMASK); +- float* xc = val + (threadIdx.x & (~BMASK)); +- float sum = 0; +- for (int i = 0; i < VSZ; ++i) sum += ac[i << BBIT] * xc[i]; +- result[index] = sum; // isinf(sum) ? 
0 : sum ; // +-} +- +-void ProgramCU::MultiplyBlockConditioner(int ncam, int npoint, +- CuTexImage& blocks, CuTexImage& vector, +- CuTexImage& result, int radial, +- int mode) { +- const unsigned int bsize1 = 64; +- unsigned int bw, bh; +- +- if (mode != 2) { +- unsigned int len1 = ncam * 8; +- unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1; +- GetBlockConfiguration(nblock1, bw, bh); +- dim3 grid1(bw, bh), block1(bsize1); +- if (radial) +- multiply_block_conditioner_kernel<<>>( +- len1, (bw * bsize1), blocks.data(), vector.data(), result.data()); +- else +- multiply_block_conditioner_kernel<<>>( +- len1, (bw * bsize1), blocks.data(), vector.data(), result.data()); +- CheckErrorCUDA("MultiplyBlockConditioner"); +- } +- +- if (mode != 1) { +- const unsigned int bsize2 = 128; +- unsigned int len2 = npoint * 4; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int cbsz = radial ? 64 : 56; +- unsigned int offsetb = ncam * cbsz; +- unsigned int offsetd = ncam * 8; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- multiply_block_conditioner_kernel<<>>( +- len2, (bw * bsize2), blocks.data() + offsetb, vector.data() + offsetd, +- result.data() + offsetd); +- CheckErrorCUDA("MultiplyBlockConditioner"); +- } +-} +- +-texture tex_shuffle_jc; +-texture tex_shuffle_map; +-texture tex_shuffle_jc2; +-template +-__global__ void shuffle_camera_jacobian_kernel(int num, int bwidth, +- float4* jc) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- int fetch_idx = tex1Dfetch(tex_shuffle_map, index >> 2); +- if (TEXN == 2) { +- int texidx = fetch_idx >> 23, +- fidx = ((fetch_idx & 0x7fffff) << 2) + (index & 0x3); +- if (texidx == 0) +- jc[index] = tex1Dfetch(tex_shuffle_jc, fidx); +- else if (texidx == 1) +- jc[index] = tex1Dfetch(tex_shuffle_jc2, fidx); +- } +- if (TEXN == 1) { +- jc[index] = tex1Dfetch(tex_shuffle_jc, (fetch_idx << 2) + (index & 0x3)); +- } +-} +- +-bool ProgramCU::ShuffleCameraJacobian(CuTexImage& jc, CuTexImage& map, +- CuTexImage& result) { +- if (!result.IsValid()) return false; +- size_t szjc = jc.GetDataSize(); +- unsigned int len = map.GetImgWidth() * 4; +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- +- map.BindTexture(tex_shuffle_map); +- +- if (szjc > 2 * MAX_TEXSIZE) { +- fprintf(stderr, "datasize way too big %lX, %lX+...\n", szjc, +- (szjc) / MAX_TEXSIZE); +- return false; +- } else if (szjc > MAX_TEXSIZE) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- jc.BindTexture2(tex_shuffle_jc, tex_shuffle_jc2); +- shuffle_camera_jacobian_kernel<2><<>>(len, (bw * bsize), +- (float4*)result.data()); +- } else { +- jc.BindTexture(tex_shuffle_jc); +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- shuffle_camera_jacobian_kernel<1><<>>(len, (bw * bsize), +- (float4*)result.data()); +- } +- CheckErrorCUDA("ShuffleCameraJacobian"); +- return true; +-} +- +-texture tex_mjx_jc; +-texture tex_mjx_jc2; +-texture tex_mjx_jc3; +-texture tex_mjx_jc4; +-texture tex_mjx_jp; +-texture tex_mjx_jp2; +-texture tex_mjx_idx; +-texture tex_mjx_x; +- +-template +-__global__ void multiply_jx_kernel(int num, int bwidth, int offset, +- float* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- if (TEXN == 4 && (index >> 24) == 3) { +- //////////////////////////////////////////// +- int2 
proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff); +- jc1 = tex1Dfetch(tex_mjx_jc4, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } else if (TEXN > 2 && (index >> 24) == 2) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff); +- jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } else if (TEXN > 1 && (index > 0xffffff)) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp, index & 0x1ffffff); +- jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } else { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp, index); +- jc1 = tex1Dfetch(tex_mjx_jc, index << 1); +- jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } +-} +- +-template +-__global__ void multiply_jcx_kernel(int num, int bwidth, float* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- if (TEXN == 4 && (index >> 24) == 3) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc4, (index & 
0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } else if (TEXN > 2 && (index >> 24) == 2) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } else if (TEXN > 1 && (index > 0xffffff)) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } else { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc, index << 1); +- jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } +-} +- +-template +-__global__ void multiply_jpx_kernel(int num, int bwidth, int offset, +- float* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- if (TEXN == 2 && index > 0x1ffffff) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- //////////////////////////////////////////// +- float4 jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff); +- ///////////////////////////////////// +- result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z; +- } else { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp = tex1Dfetch(tex_mjx_jp, index); +- ///////////////////////////////////// +- result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z; +- } +-} +- +-template +-__global__ void multiply_jx_notex2_kernel(int num, int bwidth, int offset, +- float* jcx, float* jpx, +- float* result) { +- int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- int index = threadIdx.x + bindex; +- +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, 
proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- //////////////////////////////////////////// +- __shared__ float jps[KW * 4]; +- __shared__ float jcs[KW * 8]; +- +- for (int i = threadIdx.x; i < 4 * KW; i += KW) +- jps[i] = jpx[(bindex << 2) + i]; +- for (int i = threadIdx.x; i < 8 * KW; i += KW) +- jcs[i] = jcx[(bindex << 3) + i]; +- +- __syncthreads(); +- if (index >= num) return; +- +- ///////////////////////////////////// +- float *jp = jps + threadIdx.x * 4, *jc = jcs + threadIdx.x * 8; +- result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z + +- jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y + +- jc[6] * xc2.z + jc[7] * xc2.w + jp[0] * xp.x + jp[1] * xp.y + +- jp[2] * xp.z; +-} +- +-template +-__global__ void multiply_jpx_notex2_kernel(int num, int bwidth, int offset, +- float* jpx, float* result) { +- int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- int index = threadIdx.x + bindex; +- +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- //////////////////////////////////////////// +- __shared__ float jps[KW * 4]; +- +- for (int i = threadIdx.x; i < 4 * KW; i += KW) +- jps[i] = jpx[(bindex << 2) + i]; +- +- __syncthreads(); +- if (index >= num) return; +- +- ///////////////////////////////////// +- float* jp = jps + threadIdx.x * 4; +- result[index] = jp[0] * xp.x + jp[1] * xp.y + jp[2] * xp.z; +-} +- +-template +-__global__ void multiply_jcx_notex2_kernel(int num, int bwidth, float* jcx, +- float* result) { +- int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- int index = threadIdx.x + bindex; +- +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- //////////////////////////////////////////// +- +- __shared__ float jcs[KW * 8]; +- for (int i = threadIdx.x; i < 8 * KW; i += KW) +- jcs[i] = jcx[(bindex << 3) + i]; +- +- __syncthreads(); +- if (index >= num) return; +- +- ///////////////////////////////////// +- float* jc = jcs + threadIdx.x * 8; +- result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z + +- jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y + +- jc[6] * xc2.z + jc[7] * xc2.w; +-} +- +-void ProgramCU::ComputeJX(int point_offset, CuTexImage& x, CuTexImage& jc, +- CuTexImage& jp, CuTexImage& jmap, CuTexImage& result, +- int mode) { +- // given a vector of parameters.... 
+- // multiply the Jacobian Matrix with it [jc jp] * p +- // for each measurment, read back the jacobian +- // multiply and summ up th corresponding +- +- unsigned int nproj = jmap.GetImgWidth(); +- unsigned int len = nproj * 2; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- jmap.BindTexture(tex_mjx_idx); +- x.BindTexture(tex_mjx_x); +- +- if (mode == 0) { +- size_t szjc = jc.GetDataSize(); +- if (TEX_TOOBIG4(szjc)) { +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jx_notex2_kernel<64><<>>( +- len, (bw * bsize), point_offset, jc.data(), jp.data(), result.data()); +- } else if (szjc > 2 * MAX_TEXSIZE) { +- jp.BindTexture2(tex_mjx_jp, tex_mjx_jp2); +- jc.BindTexture4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jx_kernel<4><<>>(len, (bw * bsize), point_offset, +- result.data()); +- } else if (szjc > MAX_TEXSIZE) { +- jp.BindTexture(tex_mjx_jp); +- jc.BindTexture2(tex_mjx_jc, tex_mjx_jc2); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jx_kernel<2><<>>(len, (bw * bsize), point_offset, +- result.data()); +- } else { +- jp.BindTexture(tex_mjx_jp); +- jc.BindTexture(tex_mjx_jc); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bh, bw), block(bsize); +- multiply_jx_kernel<1><<>>(len, (bh * bsize), point_offset, +- result.data()); +- } +- CheckErrorCUDA("ComputeJX"); +- } else if (mode == 1) { +- size_t szjc = jc.GetDataSize(); +- if (TEX_TOOBIG4(szjc)) { +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jcx_notex2_kernel<64><<>>(len, (bw * bsize), +- jc.data(), result.data()); +- } else if (szjc > 2 * MAX_TEXSIZE) { +- jc.BindTexture4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jcx_kernel<4><<>>(len, (bw * bsize), result.data()); +- } else if (szjc > MAX_TEXSIZE) { +- jc.BindTexture2(tex_mjx_jc, tex_mjx_jc2); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jcx_kernel<2><<>>(len, (bw * bsize), result.data()); +- } else { +- jc.BindTexture(tex_mjx_jc); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bh, bw), block(bsize); +- multiply_jcx_kernel<1><<>>(len, (bh * bsize), result.data()); +- } +- CheckErrorCUDA("ComputeJCX"); +- } else if (mode == 2) { +- size_t szjp = jp.GetDataSize(); +- if (szjp > MAX_TEXSIZE) { +- jp.BindTexture(tex_mjx_jp); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jpx_kernel<2><<>>(len, (bw * bsize), point_offset, +- result.data()); +- } else { +- jp.BindTexture(tex_mjx_jp); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bh, bw), block(bsize); +- multiply_jpx_kernel<1><<>>(len, (bh * bsize), point_offset, +- result.data()); +- } +- CheckErrorCUDA("ComputeJPX"); +- } +-} +- +-template +-__device__ void jacobian_internal(int camera_pos, int pt_pos, int tidx, +- float* r, float jic, float* jxc, float* jyc, +- float* jxp, float* jyp) { +- float m[3]; +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, 
camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +- +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- +- JACOBIAN_SET_JC_BEGIN +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- ///////////////////////////////////////////////////// +- jxc[0] = p0_p2 * jfc; +- jxc[1] = f_p2_x; +- jxc[2] = 0; +- jxc[3] = -f_p2_x * p0_p2; +- jxc[4] = -f_p2_x * p0_p2 * y0; +- jxc[5] = f_p2_x * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2_x * y0; +- jxc[7] = ft_x_pn * p0_p2; +- +- jyc[0] = p1_p2 * jfc; +- jyc[1] = 0; +- jyc[2] = f_p2_y; +- jyc[3] = -f_p2_y * p1_p2; +- jyc[4] = -f_p2_y * (z0 + y0 * p1_p2); +- jyc[5] = f_p2_y * x0 * p1_p2; +- jyc[6] = f_p2_y * x0; +- jyc[7] = ft_x_pn * p1_p2; +- JACOBIAN_SET_JC_END +- /////////////////////////////////// +- jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2); +- } else { +- JACOBIAN_SET_JC_BEGIN +- jxc[0] = p0_p2 * jic; +- jxc[1] = f_p2; +- jxc[2] = 0; +- jxc[3] = -f_p2 * p0_p2; +- jxc[4] = -f_p2 * p0_p2 * y0; +- jxc[5] = f_p2 * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2 * y0; +- +- jyc[0] = p1_p2 * jic; +- jyc[1] = 0; +- jyc[2] = f_p2; +- jyc[3] = -f_p2 * p1_p2; +- jyc[4] = -f_p2 * (z0 + y0 * p1_p2); +- jyc[5] = f_p2 * x0 * p1_p2; +- jyc[6] = f_p2 * x0; +- +- if (md) { +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jxc[7] = -ms.x * msn; +- jyc[7] = -ms.y * msn; +- } else { +- jxc[7] = 0; +- jyc[7] = 0; +- } +- JACOBIAN_SET_JC_END +- /////////////////////////////////// +- jxp[0] = f_p2 * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2 * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2 * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2 * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2 * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2 * (r[5] - r[8] * p1_p2); +- } +-} +- +-template +-__device__ void jacobian_camera_internal(int camera_pos, int pt_pos, int tidx, +- float* r, float jic, float* jxc, +- float* jyc) { +- float m[3]; +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +-#ifndef PBA_DISABLE_CONST_CAMERA +- if (r3.w != 0.0f) { +- jxc[0] = 0; +- 
jxc[1] = 0; +- jxc[2] = 0; +- jxc[3] = 0; +- jxc[4] = 0; +- jxc[5] = 0; +- jxc[6] = 0; +- jxc[7] = 0; +- jyc[0] = 0; +- jyc[1] = 0; +- jyc[2] = 0; +- jyc[3] = 0; +- jyc[4] = 0; +- jyc[5] = 0; +- jyc[6] = 0; +- jyc[7] = 0; +- } else +-#endif +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- ///////////////////////////////////////////////////// +- jxc[0] = p0_p2 * jfc; +- jxc[1] = f_p2_x; +- jxc[2] = 0; +- jxc[3] = -f_p2_x * p0_p2; +- jxc[4] = -f_p2_x * p0_p2 * y0; +- jxc[5] = f_p2_x * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2_x * y0; +- jxc[7] = ft_x_pn * p0_p2; +- +- jyc[0] = p1_p2 * jfc; +- jyc[1] = 0; +- jyc[2] = f_p2_y; +- jyc[3] = -f_p2_y * p1_p2; +- jyc[4] = -f_p2_y * (z0 + y0 * p1_p2); +- jyc[5] = f_p2_y * x0 * p1_p2; +- jyc[6] = f_p2_y * x0; +- jyc[7] = ft_x_pn * p1_p2; +- } else { +- jxc[0] = p0_p2 * jic; +- jxc[1] = f_p2; +- jxc[2] = 0; +- jxc[3] = -f_p2 * p0_p2; +- jxc[4] = -f_p2 * p0_p2 * y0; +- jxc[5] = f_p2 * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2 * y0; +- +- jyc[0] = p1_p2 * jic; +- jyc[1] = 0; +- jyc[2] = f_p2; +- jyc[3] = -f_p2 * p1_p2; +- jyc[4] = -f_p2 * (z0 + y0 * p1_p2); +- jyc[5] = f_p2 * x0 * p1_p2; +- jyc[6] = f_p2 * x0; +- +- if (md) { +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jxc[7] = -ms.x * msn; +- jyc[7] = -ms.y * msn; +- } else { +- jxc[7] = 0; +- jyc[7] = 0; +- } +- } +-} +- +-template +-__device__ void jacobian_point_internal(int camera_pos, int pt_pos, int tidx, +- float* r, float* jxp, float* jyp) { +- float m[3]; +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +- +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- /////////////////////////////////// +- jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2); +- } else { +- /////////////////////////////////// +- jxp[0] = f_p2 * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2 * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2 * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2 * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2 * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2 * (r[5] - r[8] * p1_p2); +- } +-} +- +-template +-__global__ void multiply_jx_noj_kernel(int num, int bwidth, int offset, +- float jic, float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y 
* bwidth; +- if (index >= num) return; +- +- __shared__ float data[9 * 64]; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float jxc[8], jyc[8], jxp[3], jyp[3]; +- jacobian_internal(proj.x << 1, proj.y, index, data + 9 * threadIdx.x, +- jic, jxc, jyc, jxp, jyp); +- +- ///////////////////////////////////// +- result[index] = make_float2( +- jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w + +- jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w + +- jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z, +- jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w + +- jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w + +- jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z); +-} +- +-template +-__global__ void multiply_jcx_noj_kernel(int num, int bwidth, float jic, +- float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- __shared__ float data[9 * 64]; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float jxc[8], jyc[8]; +- jacobian_camera_internal(proj.x << 1, proj.y, index, +- data + 9 * threadIdx.x, jic, jxc, jyc); +- +- ///////////////////////////////////// +- result[index] = make_float2( +- jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w + +- jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w, +- jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w + +- jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w); +-} +- +-template +-__global__ void multiply_jpx_noj_kernel(int num, int bwidth, int offset, +- float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- __shared__ float data[9 * 64]; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float jxp[3], jyp[3]; +- jacobian_point_internal(proj.x << 1, proj.y, index, +- data + 9 * threadIdx.x, jxp, jyp); +- +- ///////////////////////////////////// +- result[index] = make_float2(jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z, +- jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z); +-} +- +-void ProgramCU::ComputeJX_(CuTexImage& x, CuTexImage& jx, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, +- CuTexImage& pjmap, bool intrinsic_fixed, +- int radial_distortion, int mode) { +- unsigned int nproj = pjmap.GetImgWidth(); +- unsigned int len = nproj; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- int point_offset = camera.GetImgWidth() * 2; +- float jfc = intrinsic_fixed ? 
0 : 1.0f; +- +- ///////////////////////////// +- pjmap.BindTexture(tex_mjx_idx); +- x.BindTexture(tex_mjx_x); +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- +- /////////////////////////////////// +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- if (mode == 0) { +- if (radial_distortion == -1) { +- meas.BindTexture(tex_jacobian_meas); +- multiply_jx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, jfc, (float2*)jx.data()); +- } else if (radial_distortion) { +- multiply_jx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, jfc, (float2*)jx.data()); +- } else { +- multiply_jx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, jfc, (float2*)jx.data()); +- } +- +- CheckErrorCUDA("ComputeJX_"); +- } else if (mode == 1) { +- if (radial_distortion == -1) { +- meas.BindTexture(tex_jacobian_meas); +- multiply_jcx_noj_kernel<<>>( +- len, (bw * bsize), jfc, (float2*)jx.data()); +- } else if (radial_distortion) { +- multiply_jcx_noj_kernel<<>>( +- len, (bw * bsize), jfc, (float2*)jx.data()); +- } else { +- multiply_jcx_noj_kernel<<>>( +- len, (bw * bsize), jfc, (float2*)jx.data()); +- } +- +- CheckErrorCUDA("ComputeJCX_"); +- } else if (mode == 2) { +- if (radial_distortion == 1) { +- multiply_jpx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, (float2*)jx.data()); +- } else { +- multiply_jpx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, (float2*)jx.data()); +- } +- +- CheckErrorCUDA("ComputeJX_"); +- } +-} +- +-template +-__global__ void jte_cam_vec_noj_kernel(int num, int rowsz, float jic, +- float* jte) { +- __shared__ float value[KH * 32 * 9]; // 8 * KH * 32 +- int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz; +- if (cam >= num) return; +- +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam); // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1); // last camera + 1 +- +- float* valuec = value + 32 * 9 * threadIdx.y; +- float* rp = valuec + threadIdx.x * 9; +- float rr[8], jxc[8], jyc[8]; +- for (int i = 0; i < 8; ++i) rr[i] = 0; +- +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- int index = tex1Dfetch(tex_jte_cmt, i); +- int2 proj = tex1Dfetch(tex_jacobian_idx, index); +- jacobian_camera_internal(cam << 2, proj.y, index, rp, jic, jxc, +- jyc); +- float2 vv = tex1Dfetch(tex_jte_pe, index); +- // +- for (int j = 0; j < 8; ++j) rr[j] += (jxc[j] * vv.x + jyc[j] * vv.y); +- } +- +- float* valuei = valuec + 8 * threadIdx.x; +- for (int i = 0; i < 8; ++i) valuei[i] = rr[i]; +- valuec[threadIdx.x] = (valuec[threadIdx.x] + valuec[threadIdx.x + 32] + +- valuec[threadIdx.x + 64] + valuec[threadIdx.x + 96] + +- valuec[threadIdx.x + 128] + valuec[threadIdx.x + 160] + +- valuec[threadIdx.x + 192] + valuec[threadIdx.x + 224]); +- if (threadIdx.x < 16) valuec[threadIdx.x] += valuec[threadIdx.x + 16]; +- if (threadIdx.x < 8) +- valuec[threadIdx.x] = valuec[threadIdx.x] + valuec[threadIdx.x + 8]; +- +- //////////////////////////////////// +- if (threadIdx.x < 8) jte[(cam << 3) + threadIdx.x] = valuec[threadIdx.x]; +-} +- +-template +-__global__ void jte_point_vec_noj_kernel(int num, int rowsz, float* jte) { +- //////////////////////////// +- __shared__ float value[KH * (9 * 32)]; +- int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jte_pmp, index); // first +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1 +- float rx = 0, ry = 0, rz = 0, jxp[3], jyp[3]; +- int rowp = threadIdx.y * 9 * 32; +- float* rp = value + threadIdx.x * 9 + rowp; +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- float2 ev = tex1Dfetch(tex_jte_pe, i); +- int2 proj = tex1Dfetch(tex_jacobian_idx, i); +- jacobian_point_internal(proj.x << 1, proj.y, i, rp, jxp, jyp); +- rx += (jxp[0] * ev.x + jyp[0] * ev.y); +- ry += (jxp[1] * ev.x + jyp[1] * ev.y); +- rz += (jxp[2] * ev.x + jyp[2] * ev.y); +- } +- +- int loc = (threadIdx.x << 2) + rowp; +- value[loc] = rx; +- value[loc + 1] = ry; +- value[loc + 2] = rz; +- value[loc + 3] = 0; +- +- int ridx = threadIdx.x + rowp; +- value[ridx] = ((value[ridx] + value[ridx + 32]) + +- (value[ridx + 64] + value[ridx + 96])); +- if (threadIdx.x < 16) value[ridx] += value[ridx + 16]; +- if (threadIdx.x < 8) value[ridx] += value[ridx + 8]; +- if (threadIdx.x < 4) +- jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4]; +-} +- +-void ProgramCU::ComputeJtE_(CuTexImage& e, CuTexImage& jte, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, +- CuTexImage& cmap, CuTexImage& cmlist, +- CuTexImage& pmap, CuTexImage& pjmap, CuTexImage& jp, +- bool intrinsic_fixed, int radial_distortion, +- int mode) { +- pjmap.BindTexture(tex_jacobian_idx); +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- if (radial_distortion) meas.BindTexture(tex_jacobian_meas); +- +- cmap.BindTexture(tex_jte_cmp); +- cmlist.BindTexture(tex_jte_cmt); +- e.BindTexture(tex_jte_pe); +- +- // +- unsigned int bw, bh; +- float jfc = intrinsic_fixed ? 
0 : 1.0f; +- int ncam = camera.GetImgWidth(); +- const int bheight1 = 2, bsize = 32; +- int nblock1 = (ncam + bheight1 - 1) / bheight1; +- GetBlockConfiguration(nblock1, bw, bh); +- dim3 grid(bw, bh), block(bsize, bheight1); +- if (mode == 2) { +- } else if (radial_distortion == -1) +- jte_cam_vec_noj_kernel<<>>( +- ncam, bw * bheight1, jfc, jte.data()); +- else if (radial_distortion) +- jte_cam_vec_noj_kernel<<>>( +- ncam, bw * bheight1, jfc, jte.data()); +- else +- jte_cam_vec_noj_kernel<<>>( +- ncam, bw * bheight1, jfc, jte.data()); +- CheckErrorCUDA("ComputeJtE_"); +- +- int npt = point.GetImgWidth(); +- unsigned int offsetv = 8 * ncam; +- const int bheight2 = 2, bsize2 = 32; +- int nblock2 = (npt + bheight2 - 1) / bheight2; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2, bheight2); +- if (mode == 1) { +- } else if (jp.IsValid()) { +- pmap.BindTexture(tex_jte_pmp); +- e.BindTexture(tex_jte_pex); +- jp.BindTexture2(tex_jte_jp, tex_jte_jp2); +- if (jp.GetDataSize() > MAX_TEXSIZE) +- jte_point_vec_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- else +- jte_point_vec_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- } else { +- pmap.BindTexture(tex_jte_pmp); +- if (radial_distortion && radial_distortion != -1) +- jte_point_vec_noj_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- else +- jte_point_vec_noj_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- } +- CheckErrorCUDA("ComputeJtE_"); +-} +- +-template +-__global__ void jtjd_cam_block_noj_kernel(int num, int rowsz, float lambda1, +- float lambda2, float jic, float* diag, +- float* blocks, +- bool add_existing_diagc) { +- const int VN = (md || pd) ? 8 : 7; +- __shared__ float buffer_all[32 * 9 * KH]; +- __shared__ float value_all[64 * KH]; +- +- // 8thread per camera +- int bcam = blockIdx.x * KH + blockIdx.y * rowsz; +- +- int cam = bcam + threadIdx.y; +- if (cam >= num) return; +- +- float* buffer = buffer_all + threadIdx.y * (32 * 9); +- float* value = value_all + threadIdx.y * 64; +- +- float jxc[8], jyc[8]; +- float* rp = buffer + threadIdx.x * 9; +- float row0[VN], row1[VN - 1], row2[VN - 2], row3[VN - 3]; +- float row4[VN - 4], row5[VN - 5], row6[VN - 6], row7[1] = {0}; +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jtjd_cmp, cam); // first camera +- int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1); // last camera + 1 +- +-#define REPEAT7(FUNC) \ +- FUNC(0); \ +- FUNC(1); \ +- FUNC(2); \ +- FUNC(3); \ +- FUNC(4); \ +- FUNC(5); \ +- FUNC(6); +-#define SETZERO(k) \ +- for (int j = 0; j < VN - k; ++j) row##k[j] = 0; +- REPEAT7(SETZERO); +- +- float4 sjv[2]; +- if (scaling && (pd || md)) { +- sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1)); +- sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1); +- } +- +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- ///////////////////////////////////////// +- int index = tex1Dfetch(tex_jtjd_cmlist, i); +- int2 proj = tex1Dfetch(tex_jacobian_idx, index); +- +- /////////////////////////////////////////////// +- jacobian_camera_internal(cam << 2, proj.y, index, rp, jic, jxc, +- jyc); +- +- if (scaling && (pd || md)) { +- float* sj = (float*)sjv; // 32 threads...64 values +- for (int j = 0; j < VN; ++j) { +- jxc[j] *= sj[j]; +- jyc[j] *= sj[j]; +- } +- } +- +-//////////////////////////////////////////////// +-#define ADDROW(k) \ +- for (int j = k; j < VN; ++j) \ +- row##k[j - k] += (jxc[k] * jxc[j] + jyc[k] * jyc[j]) +- +- /////////////// +- REPEAT7(ADDROW); +- if (VN == 8) { +- ADDROW(7); +- } +- } +- +-//////////////////////////////////// +-// make the matrix..//add up the 32 * 8 matrix +-#define JTJDSUM8_V1() \ +- buffer[threadIdx.x] = \ +- (buffer[threadIdx.x] + buffer[threadIdx.x + 32] + \ +- buffer[threadIdx.x + 64] + buffer[threadIdx.x + 96] + \ +- buffer[threadIdx.x + 128] + buffer[threadIdx.x + 160] + \ +- buffer[threadIdx.x + 192] + buffer[threadIdx.x + 224]); +- +-#define JTJDSUM8_V2() \ +- buffer[threadIdx.x] = \ +- (((buffer[threadIdx.x] + buffer[threadIdx.x + 128]) + \ +- (buffer[threadIdx.x + 64] + buffer[threadIdx.x + 192])) + \ +- ((buffer[threadIdx.x + 32] + buffer[threadIdx.x + 160]) + \ +- (buffer[threadIdx.x + 96] + buffer[threadIdx.x + 224]))); +- +-#define STORE_ROWS(k) \ +- for (int i = 0; i < (VN - k); ++i) bufi[i] = row##k[i]; \ +- JTJDSUM8_V2(); \ +- if (threadIdx.x < 16 - k) buffer[threadIdx.x] += buffer[threadIdx.x + 16]; \ +- if (threadIdx.x < 8 - k) \ +- value[threadIdx.x + k * 9] = buffer[threadIdx.x] + buffer[threadIdx.x + 8]; +- +- float* bufi = buffer + threadIdx.x * 8; +- REPEAT7(STORE_ROWS); +- if (VN == 8) { +- STORE_ROWS(7); +- } +- +- ///////////////////////////////////////////////////////////////////////////////////////////// +- +- //////////////////////////////// (8 * i + j) -> (8 * j + i) +- //#define COPYSYM(i) if(threadIdx.x < VN - i - 1) value[threadIdx.x * 8 + i * +- //9 + 8] = value[threadIdx.x + i * 9 + 1]; +- if (threadIdx.x < VN - 1) value[threadIdx.x * 8 + 8] = value[threadIdx.x + 1]; +- if (threadIdx.x < VN - 2) +- value[threadIdx.x * 8 + 17] = value[threadIdx.x + 10]; +- if (threadIdx.x < VN - 3) +- value[threadIdx.x * 8 + 26] = value[threadIdx.x + 19]; +- if (threadIdx.x < VN - 4) +- value[threadIdx.x * 8 + 35] = value[threadIdx.x + 28]; +- if (threadIdx.x < VN - 5) +- value[threadIdx.x * 8 + 44] = value[threadIdx.x + 37]; +- if (threadIdx.x < VN - 6) +- value[threadIdx.x * 8 + 53] = value[threadIdx.x + 46]; +- if (VN == 8 && threadIdx.x < VN - 7) +- value[threadIdx.x * 8 + 62] = value[threadIdx.x + 55]; +- +- if (scaling && !pd && !md) { +- float4 sjv[2]; +- float* sj = (float*)sjv; // 32 threads...64 values +- sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1)); +- sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1); +- float sji = sj[threadIdx.x & 0x07]; +- value[threadIdx.x] *= (sji * sj[threadIdx.x / 8]); +- value[threadIdx.x + 32] *= (sji * sj[4 + threadIdx.x / 8]); +- } +- +- bool zero = ((threadIdx.x & 0x7) == VN); +- +- ///////////write back +- if (threadIdx.x < 8) { +- float* dp = value + threadIdx.x * 9; +- float temp = zero ? 
0 : dp[0]; +- int didx = threadIdx.x + (cam << 3); +- if (add_existing_diagc) temp += diag[didx]; +- diag[didx] = temp; +- dp[0] = lambda1 + lambda2 * temp; +- } +- int wpos = cam * (8 * VN) + threadIdx.x; +- blocks[wpos] = zero ? 0 : value[threadIdx.x]; +- if (threadIdx.x < VN * 8 - 32) +- blocks[wpos + 32] = zero ? 0 : value[threadIdx.x + 32]; +-} +- +-template +-__global__ void jtjd_point_block_noj_kernel(int num, int rowsz, float lambda1, +- float lambda2, float4* diag, +- float4* blocks, int ptx) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (index >= num) return; +- +- __shared__ float value[KW * 9]; +- int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first +- int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last + 1 +- +- float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0; +- float jxp[3], jyp[3]; +- float* rp = value + threadIdx.x * 9; +- +- float4 sj; +- if (scaling && pd) sj = tex1Dfetch(tex_jacobian_sj, index + ptx); +- +- for (int i = idx1; i < idx2; ++i) { +- int2 proj = tex1Dfetch(tex_jacobian_idx, i); +- jacobian_point_internal(proj.x << 1, proj.y, i, rp, jxp, jyp); +- +- if (scaling && pd) { +- jxp[0] *= sj.x; +- jxp[1] *= sj.y; +- jxp[2] *= sj.z; +- jyp[0] *= sj.x; +- jyp[1] *= sj.y; +- jyp[2] *= sj.z; +- } +- M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]); +- M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]); +- M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]); +- M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]); +- M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]); +- M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]); +- } +- +- if (scaling && !pd) { +- sj = tex1Dfetch(tex_jacobian_sj, index + ptx); +- M00 *= (sj.x * sj.x); +- M01 *= (sj.x * sj.y); +- M02 *= (sj.x * sj.z); +- M11 *= (sj.y * sj.y); +- M12 *= (sj.y * sj.z); +- M22 *= (sj.z * sj.z); +- } +- +- diag[index] = make_float4(M00, M11, M22, 0); +- +- M00 = lambda2 * M00 + lambda1; +- M11 = lambda2 * M11 + lambda1; +- M22 = lambda2 * M22 + lambda1; +- +- // invert the 3x3 matrix. +- float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 1] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 2] = make_float4(0, 0, 0, 0); +- } else { +- float m00 = (M11 * M22 - M12 * M12) / det; +- float m01 = -(M01 * M22 - M12 * M02) / det; +- float m02 = (M01 * M12 - M02 * M11) / det; +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(m00, m01, m02, 0); +- +- float m11 = (M00 * M22 - M02 * M02) / det; +- float m12 = -(M00 * M12 - M01 * M02) / det; +- blocks[write_pos + 1] = make_float4(m01, m11, m12, 0); +- +- float m22 = (M00 * M11 - M01 * M01) / det; +- blocks[write_pos + 2] = make_float4(m02, m12, m22, 0); +- } +-} +- +-void ProgramCU::ComputeDiagonalBlock_( +- float lambda, bool dampd, CuTexImage& camera, CuTexImage& point, +- CuTexImage& meas, CuTexImage& cmap, CuTexImage& cmlist, CuTexImage& pmap, +- CuTexImage& jmap, CuTexImage& jp, CuTexImage& sj, CuTexImage& diag, +- CuTexImage& blocks, bool intrinsic_fixed, int radial_distortion, +- bool add_existing_diagc, int mode) { +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? (1.0f + lambda) : 1.0f; +- float jfc = intrinsic_fixed ? 
0.0f : 1.0f; +- +- ////////////////////////////////// +- jmap.BindTexture(tex_jacobian_idx); +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- cmap.BindTexture(tex_jtjd_cmp); +- cmlist.BindTexture(tex_jtjd_cmlist); +- +- //////////////////////////////////////////////////// +- const unsigned int bsize1 = 32; +- const unsigned int bheight1 = 2; +- unsigned int ncam = camera.GetImgWidth(); // how many cameras +- unsigned int nblock = (ncam + bheight1 - 1) / bheight1; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 block1(bsize1, bheight1), grid1(bw, bh); +- +- /////////////////////////////////////////////////// +- if (radial_distortion == -1) meas.BindTexture(tex_jacobian_meas); +- if (mode == 2) { +- // skip the camera part. +- } else if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- if (radial_distortion == -1) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else if (radial_distortion) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- } else { +- if (radial_distortion == -1) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else if (radial_distortion) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- } +- CheckErrorCUDA("ComputeDiagonalBlock_"); +- +- //////////////////////////////////////////////////// +- const unsigned int bsize2 = 64; +- unsigned int npoint = point.GetImgWidth(); +- unsigned int len2 = npoint; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int offsetd = 2 * ncam; +- unsigned int offsetb = (radial_distortion ? 
16 : 14) * ncam; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- pmap.BindTexture(tex_jtjd_pmp); +- +- if (mode == 1) { +- } else if (jp.IsValid()) { +- jp.BindTexture2(tex_jtjd_jp, tex_jtjd_jp2); +- if (jp.GetDataSize() > MAX_TEXSIZE) +- jtjd_point_block_kernel<2><<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb); +- else +- jtjd_point_block_kernel<1><<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb); +- } else { +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- if (radial_distortion && radial_distortion != -1) +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, offsetd); +- else +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, offsetd); +- } else { +- if (radial_distortion && radial_distortion != -1) +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, 0); +- else +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, 0); +- } +- } +- CheckErrorCUDA("ComputeDiagonalBlock_"); +- +- //////////////////////////////////////////////////// +- if (mode != 2) { +- const unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH; +- unsigned int len3 = ncam * 8; +- unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3; +- dim3 grid3(nblock3), block3(bsize3); +- if (radial_distortion) +- jtjd_cam_block_invert_kernel<8><<>>( +- len3, (float4*)blocks.data()); +- else +- jtjd_cam_block_invert_kernel<7><<>>( +- len3, (float4*)blocks.data()); +- CheckErrorCUDA("ComputeDiagonalBlockInverse"); +- } +-} +- +-__global__ void projection_q_kernel(int nproj, int rowsz, float2* pj) { +- //////////////////////////////// +- int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (tidx >= nproj) return; +- int2 proj = tex1Dfetch(tex_projection_idx, tidx); +- float2 wq = tex1Dfetch(tex_projection_mea, tidx); +- /////////////////////////////////// +- float f1 = tex1Dfetch(tex_projection_cam, proj.x * 4).x; +- float r1 = tex1Dfetch(tex_projection_cam, proj.x * 4 + 3).w; +- float f2 = tex1Dfetch(tex_projection_cam, proj.y * 4).x; +- float r2 = tex1Dfetch(tex_projection_cam, proj.y * 4 + 3).w; +- pj[tidx] = make_float2(-wq.x * (f1 - f2), -wq.y * (r1 - r2)); +-} +- +-void ProgramCU::ComputeProjectionQ(CuTexImage& camera, CuTexImage& qmap, +- CuTexImage& qw, CuTexImage& proj, +- int offset) { +- /////////////////////////////////////// +- unsigned int len = qmap.GetImgWidth(); +- unsigned int bsize = PROJECTION_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- /////////////////////////////////////////// +- camera.BindTexture(tex_projection_cam); +- qmap.BindTexture(tex_projection_idx); +- qw.BindTexture(tex_projection_mea); +- +- ////////////////////////////// +- projection_q_kernel<<>>(len, bw * bsize, +- ((float2*)proj.data()) + offset); +-} +- +-template +-__global__ void multiply_jqx_kernel(int num, int bwidth, float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; 
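// The linear work-item index computed just above follows the launch convention used
// throughout this file: with bwidth == gridDim.x * blockDim.x,
//     index = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y),
// i.e. the y-dimension of the grid simply continues the numbering of the x-dimension.
// GetBlockConfiguration(nblock, bw, bh) presumably splits an oversized 1-D block count
// into such a (bw, bh) grid to stay under the per-dimension grid limit of older CUDA
// hardware; a minimal host-side sketch consistent with how (bw, bh) is consumed here
// (an assumption -- the actual implementation may differ) would be:
//     void GetBlockConfiguration(unsigned int nblock, unsigned int& bw, unsigned int& bh) {
//       const unsigned int kMaxGridDim = 65535;        // classic gridDim.x limit
//       bh = (nblock + kMaxGridDim - 1) / kMaxGridDim; // rows of blocks
//       bw = (nblock + bh - 1) / bh;                   // blocks per row, bw * bh >= nblock
//     }
// Work items past the logical end are discarded by the bounds check below.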
+- if (index >= num) return; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float2 wq = tex1Dfetch(tex_jacobian_meas, index); +- int idx1 = proj.x * 2, idx2 = proj.y * 2; +- float x11 = tex1Dfetch(tex_mjx_x, idx1).x; +- float x17 = tex1Dfetch(tex_mjx_x, idx1 + 1).w; +- float x21 = tex1Dfetch(tex_mjx_x, idx2).x; +- float x27 = tex1Dfetch(tex_mjx_x, idx2 + 1).w; +- +- if (SJ) { +- float s11 = tex1Dfetch(tex_jacobian_sj, idx1).x; +- float s17 = tex1Dfetch(tex_jacobian_sj, idx1 + 1).w; +- float s21 = tex1Dfetch(tex_jacobian_sj, idx2).x; +- float s27 = tex1Dfetch(tex_jacobian_sj, idx2 + 1).w; +- result[index] = make_float2((x11 * s11 - x21 * s21) * wq.x, +- (x17 * s17 - x27 * s27) * wq.y); +- } else { +- result[index] = make_float2((x11 - x21) * wq.x, (x17 - x27) * wq.y); +- } +-} +- +-void ProgramCU::ComputeJQX(CuTexImage& x, CuTexImage& qmap, CuTexImage& wq, +- CuTexImage& sj, CuTexImage& jx, int offset) { +- unsigned int nproj = qmap.GetImgWidth(); +- unsigned int len = nproj; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- +- ///////////////////////////// +- qmap.BindTexture(tex_mjx_idx); +- x.BindTexture(tex_mjx_x); +- wq.BindTexture(tex_jacobian_meas); +- +- /////////////////////////////////// +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- multiply_jqx_kernel<<>>(len, (bw * bsize), +- ((float2*)jx.data()) + offset); +- } else { +- multiply_jqx_kernel<<>>(len, (bw * bsize), +- ((float2*)jx.data()) + offset); +- } +-} +- +-texture tex_jte_q_idx; +-texture tex_jte_q_w; +- +-template +-__global__ void jte_cam_q_kernel(int num, int bwidth, float* jte) { +- // int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz ; +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- int2 indexp = tex1Dfetch(tex_jte_q_idx, index); +- if (indexp.x == -1) return; +- float2 wq = tex1Dfetch(tex_jte_q_w, index); +- float2 e1 = tex1Dfetch(tex_jte_pe, indexp.x); +- float2 e2 = tex1Dfetch(tex_jte_pe, indexp.y); +- int index8 = index << 3; +- if (SJ) { +- float s1 = tex1Dfetch(tex_jacobian_sj, index * 2).x; +- jte[index8] += s1 * wq.x * (e1.x - e2.x); +- float s7 = tex1Dfetch(tex_jacobian_sj, index * 2 + 1).w; +- jte[index8 + 7] += s7 * wq.y * (e1.y - e2.y); +- } else { +- jte[index8] += wq.x * (e1.x - e2.x); +- jte[index8 + 7] += wq.y * (e1.y - e2.y); +- } +-} +- +-void ProgramCU::ComputeJQtEC(CuTexImage& pe, CuTexImage& qlist, CuTexImage& wq, +- CuTexImage& sj, CuTexImage& jte) { +- int ncam = qlist.GetImgWidth(); +- const int bsize = 32; +- int nblock = (ncam + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- pe.BindTexture(tex_jte_pe); +- qlist.BindTexture(tex_jte_q_idx); +- wq.BindTexture(tex_jte_q_w); +- +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- jte_cam_q_kernel<<>>(ncam, (bw * bsize), jte.data()); +- } else { +- jte_cam_q_kernel<<>>(ncam, (bw * bsize), jte.data()); +- } +-} +- +-} // namespace pba +diff --git a/lib/PBA/ProgramCU.h b/lib/PBA/ProgramCU.h +deleted file mode 100644 +index d3d8af609..000000000 +--- a/lib/PBA/ProgramCU.h ++++ /dev/null +@@ -1,127 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ProgramCU.h +-// Author: Changchang Wu +-// Description : interface for the ProgramCU classes. 
+-// It is basically a wrapper around all the CUDA kernels +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef _PROGRAM_CU_H +-#define _PROGRAM_CU_H +- +-class CuTexImage; +- +-namespace pba { +-namespace ProgramCU { +- +-int SetCudaDevice(int device); +-size_t GetCudaMemoryCap(); +-int CheckErrorCUDA(const char* location); +-void FinishWorkCUDA(); +-void ClearPreviousError(); +-void ResetCurrentDevice(); +-void GetBlockConfiguration(unsigned int nblock, unsigned int& bw, +- unsigned int& bh); +- +-////////////////////////////////////////////////////////// +-void ComputeSQRT(CuTexImage& tex); +-void ComputeRSQRT(CuTexImage& tex); +-void ComputeVXY(CuTexImage& texX, CuTexImage& texY, CuTexImage& result, +- unsigned int part = 0, unsigned int skip = 0); +-void ComputeSAXPY(float a, CuTexImage& texX, CuTexImage& texY, +- CuTexImage& result); +-void ComputeSAX(float a, CuTexImage& texX, CuTexImage& result); +-void ComputeSXYPZ(float a, CuTexImage& texX, CuTexImage& texY, CuTexImage& texZ, +- CuTexImage& result); +-float ComputeVectorMax(CuTexImage& vector, CuTexImage& buf); +-float ComputeVectorSum(CuTexImage& vector, CuTexImage& buf, int skip); +-double ComputeVectorNorm(CuTexImage& vector, CuTexImage& buf); +-double ComputeVectorNormW(CuTexImage& vector, CuTexImage& weight, +- CuTexImage& buf); +-double ComputeVectorDot(CuTexImage& vector1, CuTexImage& vector2, +- CuTexImage& buf); +- +-////////////////////////////////////////////////////////////////////////// +-void UncompressCamera(int ncam, CuTexImage& camera0, CuTexImage& result); +-void CompressCamera(int ncam, CuTexImage& camera0, CuTexImage& result); +-void UpdateCameraPoint(int ncam, CuTexImage& camera, CuTexImage& point, +- CuTexImage& delta, CuTexImage& new_camera, +- CuTexImage& new_point, int mode = 0); +- +-///////////////////////////////////////////////////////////////////////// +-void ComputeJacobian(CuTexImage& camera, CuTexImage& point, CuTexImage& jc, +- CuTexImage& jp, CuTexImage& proj_map, CuTexImage& sj, +- CuTexImage& meas, CuTexImage& cmlist, bool intrinsic_fixed, +- int radial_distortion, bool shuffle); +-void ComputeProjection(CuTexImage& camera, CuTexImage& point, CuTexImage& meas, +- CuTexImage& proj_map, CuTexImage& proj, int radial); +-void ComputeProjectionX(CuTexImage& camera, CuTexImage& point, CuTexImage& meas, +- CuTexImage& proj_map, CuTexImage& proj, int radial); +- +-bool ShuffleCameraJacobian(CuTexImage& jc, CuTexImage& map, CuTexImage& result); +- +-///////////////////////////////////////////////////////////// +-void ComputeDiagonal(CuTexImage& jc, CuTexImage& cmap, CuTexImage& jp, +- CuTexImage& pmap, CuTexImage& cmlist, CuTexImage& jtjd, +- CuTexImage& jtjdi, bool jc_transpose, int radial, +- bool add_existing_diagc); +-void MultiplyBlockConditioner(int ncam, int npoint, CuTexImage& blocks, +- CuTexImage& vector, CuTexImage& 
result, +- int radial, int mode = 0); +- +-//////////////////////////////////////////////////////////////////////////////// +-void ComputeProjectionQ(CuTexImage& camera, CuTexImage& qmap, CuTexImage& qw, +- CuTexImage& proj, int offset); +-void ComputeJQX(CuTexImage& x, CuTexImage& qmap, CuTexImage& wq, CuTexImage& sj, +- CuTexImage& jx, int offset); +-void ComputeJQtEC(CuTexImage& pe, CuTexImage& qlist, CuTexImage& wq, +- CuTexImage& sj, CuTexImage& result); +-void ComputeDiagonalQ(CuTexImage& qlistw, CuTexImage& sj, CuTexImage& diag); +- +-////////////////////////////////////////////////////////////////////////// +-void ComputeJX(int point_offset, CuTexImage& x, CuTexImage& jc, CuTexImage& jp, +- CuTexImage& jmap, CuTexImage& result, int mode = 0); +-void ComputeJtE(CuTexImage& pe, CuTexImage& jc, CuTexImage& cmap, +- CuTexImage& cmlist, CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& jte, bool jc_transpose, int mode = 0); +-void ComputeDiagonalBlock(float lambda, bool dampd, CuTexImage& jc, +- CuTexImage& cmap, CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& cmlist, CuTexImage& diag, +- CuTexImage& blocks, int radial_distortion, +- bool jc_transpose, bool add_existing_diagc, +- int mode = 0); +- +-///////////////////////////////////////////////////////////////////// +-void ComputeJX_(CuTexImage& x, CuTexImage& jx, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, CuTexImage& pjmap, +- bool intrinsic_fixed, int radial_distortion, int mode = 0); +-void ComputeJtE_(CuTexImage& e, CuTexImage& jte, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, CuTexImage& cmap, +- CuTexImage& cmlist, CuTexImage& pmap, CuTexImage& jmap, +- CuTexImage& jp, bool intrinsic_fixed, int radial_distortion, +- int mode = 0); +-void ComputeDiagonalBlock_(float lambda, bool dampd, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, +- CuTexImage& cmap, CuTexImage& cmlist, +- CuTexImage& pmap, CuTexImage& jmap, CuTexImage& jp, +- CuTexImage& sj, CuTexImage& diag, CuTexImage& blocks, +- bool intrinsic_fixed, int radial_distortion, +- bool add_existing_diagc, int mode = 0); +- +-} // namespace ProgramCU +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/SparseBundleCPU.cpp b/lib/PBA/SparseBundleCPU.cpp +deleted file mode 100644 +index b03708209..000000000 +--- a/lib/PBA/SparseBundleCPU.cpp ++++ /dev/null +@@ -1,4369 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCPU.cpp +-// Author: Changchang Wu +-// Description : implementation of the CPU-based multicore bundle adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-using std::vector; +-using std::cout; +-using std::pair; +-using std::ofstream; +-using std::max; +- +-#include +-#include +-#include +-#include "pba.h" +-#include "SparseBundleCPU.h" +- +-#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP +-#include +-#endif +- +-//#define POINT_DATA_ALIGN4 +-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) +-#undef CPUPBA_USE_SSE +-#undef CPUPBA_USE_AVX +-#undef POINT_DATA_ALIGN4 +-#if defined(_M_ARM) && _M_ARM >= 7 && !defined(DISABLE_CPU_NEON) +-#include +-#define CPUPBA_USE_NEON +-#elif defined(__ARM_NEON) && !defined(DISABLE_CPU_NEON) +-#include +-#define CPUPBA_USE_NEON +-#endif +-#elif defined(__AVX__) && !defined(DISABLE_CPU_AVX) +-#include +-#define CPUPBA_USE_AVX +-#undef CPUPBA_USE_SSE +-#undef POINT_DATA_ALIGN4 +-#elif (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2)) && !defined(DISABLE_CPU_SSE) +-#define CPUPBA_USE_SSE +-#include +-#include +-#endif +- +-#ifdef POINT_DATA_ALIGN4 +-#define POINT_ALIGN 4 +-#else +-#define POINT_ALIGN 3 +-#endif +- +-#define POINT_ALIGN2 (POINT_ALIGN * 2) +- +-#ifdef _WIN32 +-#define NOMINMAX +-#include +-#define INLINESUFIX +-#define finite _finite +-#else +-#include +-#include +-#include +-#endif +- +-// maximum thread count +-#define THREAD_NUM_MAX 64 +-// compute the number of threads for vector operatoins, pure heuristics... +-#define AUTO_MT_NUM(sz) \ +- int((log((double)sz) / log(2.0) - 18.5) * __num_cpu_cores / 16.0) +- +-namespace pba { +- +-template +-void avec::SaveToFile(const char* name) { +- ofstream out(name); +- for (Float* p = _data; p < _last; ++p) out << (*p) << '\n'; +-} +- +-#ifdef CPUPBA_USE_SSE +-#define CPUPBA_USE_SIMD +-namespace MYSSE { +-template +-class SSE {}; +-template <> +-class SSE { +- public: +- typedef __m128 sse_type; +- static inline sse_type zero() { return _mm_setzero_ps(); } +-}; +-template <> +-class SSE { +- public: +- typedef __m128d sse_type; +- static inline sse_type zero() { return _mm_setzero_pd(); } +-}; +- +-//////////////////////////////////////////// +-template +-inline size_t sse_step() { +- return 16 / sizeof(Float); +-}; +-inline __m128 sse_load1(const float* p) { return _mm_load1_ps(p); } +-inline __m128 sse_load(const float* p) { return _mm_load_ps(p); } +-inline __m128 sse_add(__m128 s1, __m128 s2) { return _mm_add_ps(s1, s2); } +-inline __m128 sse_sub(__m128 s1, __m128 s2) { return _mm_sub_ps(s1, s2); } +-inline __m128 sse_mul(__m128 s1, __m128 s2) { return _mm_mul_ps(s1, s2); } +-inline __m128 sse_sqrt(__m128 s) { return _mm_sqrt_ps(s); } +- +-inline __m128d sse_load1(const double* p) { return _mm_load1_pd(p); } +-inline __m128d sse_load(const double* p) { return _mm_load_pd(p); } +-inline __m128d sse_add(__m128d s1, __m128d s2) { return _mm_add_pd(s1, s2); } +-inline __m128d sse_sub(__m128d s1, __m128d s2) { return _mm_sub_pd(s1, s2); } +-inline __m128d sse_mul(__m128d s1, __m128d s2) { return _mm_mul_pd(s1, s2); } +-inline __m128d sse_sqrt(__m128d s) { return _mm_sqrt_pd(s); } +- +-#ifdef _WIN32 +-inline float sse_sum(__m128 s) { +- return (s.m128_f32[0] + s.m128_f32[2]) + (s.m128_f32[1] + s.m128_f32[3]); +-} +-inline double sse_sum(__m128d s) { return s.m128d_f64[0] + s.m128d_f64[1]; } +-#else +-inline float sse_sum(__m128 s) { +- float* f = (float*)(&s); +- return (f[0] + f[2]) + (f[1] + f[3]); +-} 
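// The manual reduction above adds the four lanes in a balanced order, (f[0]+f[2])+(f[1]+f[3]).
// An equivalent horizontal sum can also be written with SSE shuffles alone; the helper below
// is only an illustrative alternative (sse_sum_shuffle is not part of the original code) and
// uses the same <xmmintrin.h>/<emmintrin.h> intrinsics as the rest of this SSE path.
inline float sse_sum_shuffle(__m128 s) {
  __m128 hi = _mm_movehl_ps(s, s);                  // (f2, f3, f2, f3)
  __m128 pair = _mm_add_ps(s, hi);                  // lane0 = f0+f2, lane1 = f1+f3
  __m128 lane1 = _mm_shuffle_ps(pair, pair, 0x55);  // broadcast lane 1
  return _mm_cvtss_f32(_mm_add_ss(pair, lane1));    // (f0+f2) + (f1+f3)
}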
+-inline double sse_sum(__m128d s) { +- double* d = (double*)(&s); +- return d[0] + d[1]; +-} +-#endif +-// inline float sse_dot(__m128 s1, __m128 s2) {__m128 temp = _mm_dp_ps(s1, +-// s2, 0xF1); float* f = (float*) (&temp); return f[0]; } +-// inline double sse_dot(__m128d s1, __m128d s2) {__m128d temp = +-// _mm_dp_pd(s1, s2, 0x31); double* f = (double*) (&temp); return f[0] ; } +-inline void sse_store(float* p, __m128 s) { _mm_store_ps(p, s); } +-inline void sse_store(double* p, __m128d s) { _mm_store_pd(p, s); } +- +-inline void data_prefetch(const void* p) { +- _mm_prefetch((const char*)p, _MM_HINT_NTA); +-} +-}; +- +-namespace ProgramCPU { +-using namespace MYSSE; +-#define SSE_ZERO SSE::zero() +-#define SSE_T typename SSE::sse_type +-///////////////////////////// +-inline void ScaleJ4(float* jcx, float* jcy, const float* sj) { +- __m128 ps = _mm_load_ps(sj); +- _mm_store_ps(jcx, _mm_mul_ps(_mm_load_ps(jcx), ps)); +- _mm_store_ps(jcy, _mm_mul_ps(_mm_load_ps(jcy), ps)); +-} +-inline void ScaleJ8(float* jcx, float* jcy, const float* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +-inline void ScaleJ4(double* jcx, double* jcy, const double* sj) { +- __m128d ps1 = _mm_load_pd(sj), ps2 = _mm_load_pd(sj + 2); +- _mm_store_pd(jcx, _mm_mul_pd(_mm_load_pd(jcx), ps1)); +- _mm_store_pd(jcy, _mm_mul_pd(_mm_load_pd(jcy), ps1)); +- _mm_store_pd(jcx + 2, _mm_mul_pd(_mm_load_pd(jcx + 2), ps2)); +- _mm_store_pd(jcy + 2, _mm_mul_pd(_mm_load_pd(jcy + 2), ps2)); +-} +-inline void ScaleJ8(double* jcx, double* jcy, const double* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +-inline float DotProduct8(const float* v1, const float* v2) { +- __m128 ds = _mm_add_ps(_mm_mul_ps(_mm_load_ps(v1), _mm_load_ps(v2)), +- _mm_mul_ps(_mm_load_ps(v1 + 4), _mm_load_ps(v2 + 4))); +- return sse_sum(ds); +-} +-inline double DotProduct8(const double* v1, const double* v2) { +- __m128d d1 = _mm_mul_pd(_mm_load_pd(v1), _mm_load_pd(v2)); +- __m128d d2 = _mm_mul_pd(_mm_load_pd(v1 + 2), _mm_load_pd(v2 + 2)); +- __m128d d3 = _mm_mul_pd(_mm_load_pd(v1 + 4), _mm_load_pd(v2 + 4)); +- __m128d d4 = _mm_mul_pd(_mm_load_pd(v1 + 6), _mm_load_pd(v2 + 6)); +- __m128d ds = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4)); +- return sse_sum(ds); +-} +- +-inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc, +- const float* xp, float* jx) { +-#ifdef POINT_DATA_ALIGN4 +- __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4), +- mxp = _mm_load_ps(xp); +- __m128 ds1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc), xc1), +- _mm_mul_ps(_mm_load_ps(jc + 4), xc2)); +- __m128 dx1 = _mm_add_ps(ds1, _mm_mul_ps(_mm_load_ps(jp), mxp)); +- jx[0] = sse_sum(dx1); +- __m128 ds2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc + 8), xc1), +- _mm_mul_ps(_mm_load_ps(jc + 12), xc2)); +- __m128 dx2 = _mm_add_ps(ds2, _mm_mul_ps(_mm_load_ps(jp + 4), mxp)); +- jx[1] = sse_sum(dx2); +-#else +- __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4); +- __m128 jc1 = _mm_load_ps(jc), jc2 = _mm_load_ps(jc + 4); +- __m128 jc3 = _mm_load_ps(jc + 8), jc4 = _mm_load_ps(jc + 12); +- __m128 ds1 = _mm_add_ps(_mm_mul_ps(jc1, xc1), _mm_mul_ps(jc2, xc2)); +- __m128 ds2 = _mm_add_ps(_mm_mul_ps(jc3, xc1), _mm_mul_ps(jc4, xc2)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-/*jx[0] = (sse_dot(jc1, xc1) + sse_dot(jc2, xc2)) + (jp[0] * xp[0] + jp[1] * +-xp[1] + jp[2] * xp[2]); 
+-jx[1] = (sse_dot(jc3, xc1) + sse_dot(jc4, xc2)) + (jp[POINT_ALIGN] * xp[0] + +-jp[POINT_ALIGN+1] * xp[1] + jp[POINT_ALIGN+2] * xp[2]);*/ +-#endif +-} +- +-inline void ComputeTwoJX(const double* jc, const double* jp, const double* xc, +- const double* xp, double* jx) { +- __m128d xc1 = _mm_load_pd(xc), xc2 = _mm_load_pd(xc + 2), +- xc3 = _mm_load_pd(xc + 4), xc4 = _mm_load_pd(xc + 6); +- __m128d d1 = _mm_mul_pd(_mm_load_pd(jc), xc1); +- __m128d d2 = _mm_mul_pd(_mm_load_pd(jc + 2), xc2); +- __m128d d3 = _mm_mul_pd(_mm_load_pd(jc + 4), xc3); +- __m128d d4 = _mm_mul_pd(_mm_load_pd(jc + 6), xc4); +- __m128d ds1 = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- +- __m128d d5 = _mm_mul_pd(_mm_load_pd(jc + 8), xc1); +- __m128d d6 = _mm_mul_pd(_mm_load_pd(jc + 10), xc2); +- __m128d d7 = _mm_mul_pd(_mm_load_pd(jc + 12), xc3); +- __m128d d8 = _mm_mul_pd(_mm_load_pd(jc + 14), xc4); +- __m128d ds2 = _mm_add_pd(_mm_add_pd(d5, d6), _mm_add_pd(d7, d8)); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-} +- +-// v += ax +-inline void AddScaledVec8(float a, const float* x, float* v) { +- __m128 aa = sse_load1(&a); +- _mm_store_ps(v, _mm_add_ps(_mm_mul_ps(_mm_load_ps(x), aa), _mm_load_ps(v))); +- _mm_store_ps(v + 4, _mm_add_ps(_mm_mul_ps(_mm_load_ps(x + 4), aa), +- _mm_load_ps(v + 4))); +-} +-// v += ax +-inline void AddScaledVec8(double a, const double* x, double* v) { +- __m128d aa = sse_load1(&a); +- _mm_store_pd(v, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x), aa), _mm_load_pd(v))); +- _mm_store_pd(v + 2, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 2), aa), +- _mm_load_pd(v + 2))); +- _mm_store_pd(v + 4, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 4), aa), +- _mm_load_pd(v + 4))); +- _mm_store_pd(v + 6, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 6), aa), +- _mm_load_pd(v + 6))); +-} +- +-inline void AddBlockJtJ(const float* jc, float* block, int vn) { +- __m128 j1 = _mm_load_ps(jc); +- __m128 j2 = _mm_load_ps(jc + 4); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m128 a = sse_load1(jc); +- _mm_store_ps(block + 0, +- _mm_add_ps(_mm_mul_ps(a, j1), _mm_load_ps(block + 0))); +- _mm_store_ps(block + 4, +- _mm_add_ps(_mm_mul_ps(a, j2), _mm_load_ps(block + 4))); +- } +-} +- +-inline void AddBlockJtJ(const double* jc, double* block, int vn) { +- __m128d j1 = _mm_load_pd(jc); +- __m128d j2 = _mm_load_pd(jc + 2); +- __m128d j3 = _mm_load_pd(jc + 4); +- __m128d j4 = _mm_load_pd(jc + 6); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m128d a = sse_load1(jc); +- _mm_store_pd(block + 0, +- _mm_add_pd(_mm_mul_pd(a, j1), _mm_load_pd(block + 0))); +- _mm_store_pd(block + 2, +- _mm_add_pd(_mm_mul_pd(a, j2), _mm_load_pd(block + 2))); +- _mm_store_pd(block + 4, +- _mm_add_pd(_mm_mul_pd(a, j3), _mm_load_pd(block + 4))); +- _mm_store_pd(block + 6, +- _mm_add_pd(_mm_mul_pd(a, j4), _mm_load_pd(block + 6))); +- } +-} +-}; +-#endif +- +-#ifdef CPUPBA_USE_AVX +-#define CPUPBA_USE_SIMD +-namespace MYAVX { +-template +-class SSE {}; +-template <> +-class SSE { +- public: +- typedef __m256 sse_type; // static size_t step() {return 4;} +- static inline sse_type zero() { return _mm256_setzero_ps(); } +-}; +-template <> +-class SSE { +- public: +- typedef __m256d sse_type; // static size_t step() {return 2;} +- static inline sse_type zero() { return _mm256_setzero_pd(); } +-}; +- +-//////////////////////////////////////////// +-template +-inline size_t sse_step() { +- return 32 / 
sizeof(Float); +-}; +-inline __m256 sse_load1(const float* p) { return _mm256_broadcast_ss(p); } +-inline __m256 sse_load(const float* p) { return _mm256_load_ps(p); } +-inline __m256 sse_add(__m256 s1, __m256 s2) { return _mm256_add_ps(s1, s2); } +-inline __m256 sse_sub(__m256 s1, __m256 s2) { return _mm256_sub_ps(s1, s2); } +-inline __m256 sse_mul(__m256 s1, __m256 s2) { return _mm256_mul_ps(s1, s2); } +-inline __m256 sse_sqrt(__m256 s) { return _mm256_sqrt_ps(s); } +- +-// inline __m256 sse_fmad(__m256 a, __m256 b, __m256 c) {return +-// _mm256_fmadd_ps(a, b, c);} +- +-inline __m256d sse_load1(const double* p) { return _mm256_broadcast_sd(p); } +-inline __m256d sse_load(const double* p) { return _mm256_load_pd(p); } +-inline __m256d sse_add(__m256d s1, __m256d s2) { return _mm256_add_pd(s1, s2); } +-inline __m256d sse_sub(__m256d s1, __m256d s2) { return _mm256_sub_pd(s1, s2); } +-inline __m256d sse_mul(__m256d s1, __m256d s2) { return _mm256_mul_pd(s1, s2); } +-inline __m256d sse_sqrt(__m256d s) { return _mm256_sqrt_pd(s); } +- +-#ifdef _WIN32 +-inline float sse_sum(__m256 s) { +- return ((s.m256_f32[0] + s.m256_f32[4]) + (s.m256_f32[2] + s.m256_f32[6])) + +- ((s.m256_f32[1] + s.m256_f32[5]) + (s.m256_f32[3] + s.m256_f32[7])); +-} +-inline double sse_sum(__m256d s) { +- return (s.m256d_f64[0] + s.m256d_f64[2]) + (s.m256d_f64[1] + s.m256d_f64[3]); +-} +-#else +-inline float sse_sum(__m256 s) { +- float* f = (float*)(&s); +- return ((f[0] + f[4]) + (f[2] + f[6])) + ((f[1] + f[5]) + (f[3] + f[7])); +-} +-inline double sse_sum(__m256d s) { +- double* d = (double*)(&s); +- return (d[0] + d[2]) + (d[1] + d[3]); +-} +-#endif +-inline float sse_dot(__m256 s1, __m256 s2) { +- __m256 temp = _mm256_dp_ps(s1, s2, 0xf1); +- float* f = (float*)(&temp); +- return f[0] + f[4]; +-} +-inline double sse_dot(__m256d s1, __m256d s2) { +- return sse_sum(sse_mul(s1, s2)); +-} +- +-inline void sse_store(float* p, __m256 s) { _mm256_store_ps(p, s); } +-inline void sse_store(double* p, __m256d s) { _mm256_store_pd(p, s); } +- +-inline void data_prefetch(const void* p) { +- _mm_prefetch((const char*)p, _MM_HINT_NTA); +-} +-}; +- +-namespace ProgramCPU { +-using namespace MYAVX; +-#define SSE_ZERO SSE::zero() +-#define SSE_T typename SSE::sse_type +- +-///////////////////////////// +-inline void ScaleJ8(float* jcx, float* jcy, const float* sj) { +- __m256 ps = _mm256_load_ps(sj); +- _mm256_store_ps(jcx, _mm256_mul_ps(_mm256_load_ps(jcx), ps)); +- _mm256_store_ps(jcy, _mm256_mul_ps(_mm256_load_ps(jcy), ps)); +-} +-inline void ScaleJ4(double* jcx, double* jcy, const double* sj) { +- __m256d ps = _mm256_load_pd(sj); +- _mm256_store_pd(jcx, _mm256_mul_pd(_mm256_load_pd(jcx), ps)); +- _mm256_store_pd(jcy, _mm256_mul_pd(_mm256_load_pd(jcy), ps)); +-} +-inline void ScaleJ8(double* jcx, double* jcy, const double* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +-inline float DotProduct8(const float* v1, const float* v2) { +- return sse_dot(_mm256_load_ps(v1), _mm256_load_ps(v2)); +-} +-inline double DotProduct8(const double* v1, const double* v2) { +- __m256d ds = _mm256_add_pd( +- _mm256_mul_pd(_mm256_load_pd(v1), _mm256_load_pd(v2)), +- _mm256_mul_pd(_mm256_load_pd(v1 + 4), _mm256_load_pd(v2 + 4))); +- return sse_sum(ds); +-} +- +-inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc, +- const float* xp, float* jx) { +- __m256 xcm = _mm256_load_ps(xc), jc1 = _mm256_load_ps(jc), +- jc2 = _mm256_load_ps(jc + 8); +- jx[0] = sse_dot(jc1, xcm) + (jp[0] * xp[0] + jp[1] * xp[1] + 
jp[2] * xp[2]); +- jx[1] = sse_dot(jc2, xcm) + +- (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-} +- +-inline void ComputeTwoJX(const double* jc, const double* jp, const double* xc, +- const double* xp, double* jx) { +- __m256d xc1 = _mm256_load_pd(xc), xc2 = _mm256_load_pd(xc + 4); +- __m256d jc1 = _mm256_load_pd(jc), jc2 = _mm256_load_pd(jc + 4); +- __m256d jc3 = _mm256_load_pd(jc + 8), jc4 = _mm256_load_pd(jc + 12); +- __m256d ds1 = _mm256_add_pd(_mm256_mul_pd(jc1, xc1), _mm256_mul_pd(jc2, xc2)); +- __m256d ds2 = _mm256_add_pd(_mm256_mul_pd(jc3, xc1), _mm256_mul_pd(jc4, xc2)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-} +- +-// v += ax +-inline void AddScaledVec8(float a, const float* x, float* v) { +- __m256 aa = sse_load1(&a); +- _mm256_store_ps(v, _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(x), aa), +- _mm256_load_ps(v))); +- //_mm256_store_ps(v, _mm256_fmadd_ps(_mm256_load_ps(x), aa, +- //_mm256_load_ps(v))); +-} +-// v += ax +-inline void AddScaledVec8(double a, const double* x, double* v) { +- __m256d aa = sse_load1(&a); +- _mm256_store_pd(v, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x), aa), +- _mm256_load_pd(v))); +- _mm256_store_pd(v + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x + 4), aa), +- _mm256_load_pd(v + 4))); +-} +- +-inline void AddBlockJtJ(const float* jc, float* block, int vn) { +- __m256 j = _mm256_load_ps(jc); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m256 a = sse_load1(jc); +- _mm256_store_ps(block, +- _mm256_add_ps(_mm256_mul_ps(a, j), _mm256_load_ps(block))); +- } +-} +- +-inline void AddBlockJtJ(const double* jc, double* block, int vn) { +- __m256d j1 = _mm256_load_pd(jc); +- __m256d j2 = _mm256_load_pd(jc + 4); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m256d a = sse_load1(jc); +- _mm256_store_pd(block + 0, _mm256_add_pd(_mm256_mul_pd(a, j1), +- _mm256_load_pd(block + 0))); +- _mm256_store_pd(block + 4, _mm256_add_pd(_mm256_mul_pd(a, j2), +- _mm256_load_pd(block + 4))); +- } +-} +-}; +- +-#endif +- +-#ifdef CPUPBA_USE_NEON +-#define CPUPBA_USE_SIMD +-#define SIMD_NO_SQRT +-#define SIMD_NO_DOUBLE +-namespace MYNEON { +-template +-class SSE {}; +-template <> +-class SSE { +- public: +- typedef float32x4_t sse_type; +-}; +- +-//////////////////////////////////////////// +-template +-inline size_t sse_step() { +- return 16 / sizeof(Float); +-}; +-inline float32x4_t sse_load1(const float* p) { return vld1q_dup_f32(p); } +-inline float32x4_t sse_load(const float* p) { return vld1q_f32(p); } +-inline float32x4_t sse_loadzero() { +- float z = 0; +- return sse_load1(&z); +-} +-inline float32x4_t sse_add(float32x4_t s1, float32x4_t s2) { +- return vaddq_f32(s1, s2); +-} +-inline float32x4_t sse_sub(float32x4_t s1, float32x4_t s2) { +- return vsubq_f32(s1, s2); +-} +-inline float32x4_t sse_mul(float32x4_t s1, float32x4_t s2) { +- return vmulq_f32(s1, s2); +-} +-// inline float32x4_t sse_sqrt(float32x4_t s) {return +-// _mm_sqrt_ps(s); } +-inline float sse_sum(float32x4_t s) { +- float* f = (float*)(&s); +- return (f[0] + f[2]) + (f[1] + f[3]); +-} +-inline void sse_store(float* p, float32x4_t s) { vst1q_f32(p, s); } +-inline void data_prefetch(const void* p) {} +-}; +-namespace ProgramCPU { +-using namespace MYNEON; +-#define SSE_ZERO sse_loadzero() +-#define SSE_T typename SSE::sse_type +-///////////////////////////// +-inline void 
ScaleJ4(float* jcx, float* jcy, const float* sj) { +- float32x4_t ps = sse_load(sj); +- sse_store(jcx, sse_mul(sse_load(jcx), ps)); +- sse_store(jcy, sse_mul(sse_load(jcy), ps)); +-} +-inline void ScaleJ8(float* jcx, float* jcy, const float* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +- +-inline float DotProduct8(const float* v1, const float* v2) { +- float32x4_t ds = sse_add(sse_mul(sse_load(v1), sse_load(v2)), +- sse_mul(sse_load(v1 + 4), sse_load(v2 + 4))); +- return sse_sum(ds); +-} +- +-inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc, +- const float* xp, float* jx) { +-#ifdef POINT_DATA_ALIGN4 +- float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4), mxp = sse_load(xp); +- float32x4_t ds1 = +- sse_add(sse_mul(sse_load(jc), xc1), sse_mul(sse_load(jc + 4), xc2)); +- float32x4_t dx1 = sse_add(ds1, sse_mul(sse_load(jp), mxp)); +- jx[0] = sse_sum(dx1); +- float32x4_t ds2 = +- sse_add(sse_mul(sse_load(jc + 8), xc1), sse_mul(sse_load(jc + 12), xc2)); +- float32x4_t dx2 = sse_add(ds2, sse_mul(sse_load(jp + 4), mxp)); +- jx[1] = sse_sum(dx2); +-#else +- float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4); +- float32x4_t jc1 = sse_load(jc), jc2 = sse_load(jc + 4); +- float32x4_t jc3 = sse_load(jc + 8), jc4 = sse_load(jc + 12); +- float32x4_t ds1 = sse_add(sse_mul(jc1, xc1), sse_mul(jc2, xc2)); +- float32x4_t ds2 = sse_add(sse_mul(jc3, xc1), sse_mul(jc4, xc2)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-/*jx[0] = (sse_dot(jc1, xc1) + sse_dot(jc2, xc2)) + (jp[0] * xp[0] + jp[1] * +-xp[1] + jp[2] * xp[2]); +-jx[1] = (sse_dot(jc3, xc1) + sse_dot(jc4, xc2)) + (jp[POINT_ALIGN] * xp[0] + +-jp[POINT_ALIGN+1] * xp[1] + jp[POINT_ALIGN+2] * xp[2]);*/ +-#endif +-} +- +-// v += ax +-inline void AddScaledVec8(float a, const float* x, float* v) { +- float32x4_t aa = sse_load1(&a); +- sse_store(v, sse_add(sse_mul(sse_load(x), aa), sse_load(v))); +- sse_store(v + 4, sse_add(sse_mul(sse_load(x + 4), aa), sse_load(v + 4))); +-} +- +-inline void AddBlockJtJ(const float* jc, float* block, int vn) { +- float32x4_t j1 = sse_load(jc); +- float32x4_t j2 = sse_load(jc + 4); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- float32x4_t a = sse_load1(jc); +- sse_store(block + 0, sse_add(sse_mul(a, j1), sse_load(block + 0))); +- sse_store(block + 4, sse_add(sse_mul(a, j2), sse_load(block + 4))); +- } +-} +-}; +-#endif +- +-namespace ProgramCPU { +-int __num_cpu_cores = 0; +-template +-double ComputeVectorNorm(const avec& vec, int mt = 0); +- +-#if defined(CPUPBA_USE_SIMD) +-template +-void ComputeSQRT(avec& vec) { +-#ifndef SIMD_NO_SQRT +- const size_t step = sse_step(); +- Float *p = &vec[0], *pe = p + vec.size(), *pex = pe - step; +- for (; p <= pex; p += step) sse_store(p, sse_sqrt(sse_load(p))); +- for (; p < pe; ++p) p[0] = sqrt(p[0]); +-#else +- for (Float* it = vec.begin(); it < vec.end(); ++it) *it = sqrt(*it); +-#endif +-} +- +-template +-void ComputeRSQRT(avec& vec) { +- Float *p = &vec[0], *pe = p + vec.size(); +- for (; p < pe; ++p) p[0] = (p[0] == 0 ? 
0 : Float(1.0) / p[0]); +- ComputeSQRT(vec); +-} +- +-template +-void SetVectorZero(Float* p, Float* pe) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- Float* pex = pe - step; +- for (; p <= pex; p += step) sse_store(p, sse); +- for (; p < pe; ++p) *p = 0; +-} +- +-template +-void SetVectorZero(avec& vec) { +- Float *p = &vec[0], *pe = p + vec.size(); +- SetVectorZero(p, pe); +-} +- +-// function not used +-template +-inline void MemoryCopyA(const Float* p, const Float* pe, Float* d) { +- const size_t step = sse_step(); +- const Float* pex = pe - step; +- for (; p <= pex; p += step, d += step) sse_store(d, sse_load(p)); +- // while(p < pe) *d++ = *p++; +-} +- +-template +-void ComputeVectorNorm(const Float* p, const Float* pe, double* psum) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- const Float* pex = pe - step; +- for (; p <= pex; p += step) { +- SSE_T ps = sse_load(p); +- sse = sse_add(sse, sse_mul(ps, ps)); +- } +- double sum = sse_sum(sse); +- for (; p < pe; ++p) sum += p[0] * p[0]; +- *psum = sum; +-} +- +-template +-double ComputeVectorNormW(const avec& vec, const avec& weight) { +- if (weight.begin() != NULL) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- const Float *p = vec, *pe = p + vec.size(), *pex = pe - step; +- const Float* w = weight; +- for (; p <= pex; p += step, w += step) { +- SSE_T pw = sse_load(w), ps = sse_load(p); +- sse = sse_add(sse, sse_mul(sse_mul(ps, pw), ps)); +- } +- double sum = sse_sum(sse); +- for (; p < pe; ++p, ++w) sum += p[0] * w[0] * p[0]; +- return sum; +- } else { +- return ComputeVectorNorm(vec, 0); +- } +-} +- +-template +-double ComputeVectorDot(const avec& vec1, const avec& vec2) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step; +- const Float* p2 = vec2; +- for (; p1 <= pex; p1 += step, p2 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse = sse_add(sse, sse_mul(ps1, ps2)); +- } +- double sum = sse_sum(sse); +- for (; p1 < pe; ++p1, ++p2) sum += p1[0] * p2[0]; +- return sum; +-} +- +-template +-void ComputeVXY(const avec& vec1, const avec& vec2, +- avec& result, size_t part = 0, size_t skip = 0) { +- const size_t step = sse_step(); +- const Float *p1 = vec1 + skip, *pe = p1 + (part ? 
part : vec1.size()), +- *pex = pe - step; +- const Float* p2 = vec2 + skip; +- Float* p3 = result + skip; +- for (; p1 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_mul(ps1, ps2)); +- } +- for (; p1 < pe; ++p1, ++p2, ++p3) *p3 = p1[0] * p2[0]; +-} +- +-template +-void ComputeSAXPY(Float a, const Float* p1, const Float* p2, Float* p3, +- Float* pe) { +- const size_t step = sse_step(); +- SSE_T aa = sse_load1(&a); +- Float* pex = pe - step; +- if (a == 1.0f) { +- for (; p3 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_add(ps2, ps1)); +- } +- } else if (a == -1.0f) { +- for (; p3 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_sub(ps2, ps1)); +- } +- } else { +- for (; p3 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_add(ps2, sse_mul(ps1, aa))); +- } +- } +- for (; p3 < pe; ++p1, ++p2, ++p3) p3[0] = a * p1[0] + p2[0]; +-} +- +-template +-void ComputeSAX(Float a, const avec& vec1, avec& result) { +- const size_t step = sse_step(); +- SSE_T aa = sse_load1(&a); +- const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step; +- Float* p3 = result; +- for (; p1 <= pex; p1 += step, p3 += step) { +- sse_store(p3, sse_mul(sse_load(p1), aa)); +- } +- for (; p1 < pe; ++p1, ++p3) p3[0] = a * p1[0]; +-} +- +-template +-inline void ComputeSXYPZ(Float a, const Float* p1, const Float* p2, +- const Float* p3, Float* p4, Float* pe) { +- const size_t step = sse_step(); +- SSE_T aa = sse_load1(&a); +- Float* pex = pe - step; +- for (; p4 <= pex; p1 += step, p2 += step, p3 += step, p4 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2), ps3 = sse_load(p3); +- sse_store(p4, sse_add(ps3, sse_mul(sse_mul(ps1, aa), ps2))); +- } +- for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) p4[0] = a * p1[0] * p2[0] + p3[0]; +-} +- +-#else +-template +-void ComputeSQRT(avec& vec) { +- Float* it = vec.begin(); +- for (; it < vec.end(); ++it) { +- *it = sqrt(*it); +- } +-} +-template +-void ComputeRSQRT(avec& vec) { +- Float* it = vec.begin(); +- for (; it < vec.end(); ++it) { +- *it = (*it == 0 ? 0 : Float(1.0) / sqrt(*it)); +- } +-} +-template +-inline void SetVectorZero(Float* p, Float* pe) { +- std::fill(p, pe, 0); +-} +-template +-inline void SetVectorZero(avec& vec) { +- std::fill(vec.begin(), vec.end(), 0); +-} +- +-template +-inline void MemoryCopyA(const Float* p, const Float* pe, Float* d) { +- while (p < pe) *d++ = *p++; +-} +- +-template +-double ComputeVectorNormW(const avec& vec, const avec& weight) { +- double sum = 0; +- const Float *it1 = vec.begin(), *it2 = weight.begin(); +- for (; it1 < vec.end(); ++it1, ++it2) { +- sum += (*it1) * (*it2) * (*it1); +- } +- return sum; +-} +- +-template +-double ComputeVectorDot(const avec& vec1, const avec& vec2) { +- double sum = 0; +- const Float *it1 = vec1.begin(), *it2 = vec2.begin(); +- for (; it1 < vec1.end(); ++it1, ++it2) { +- sum += (*it1) * (*it2); +- } +- return sum; +-} +-template +-void ComputeVectorNorm(const Float* p, const Float* pe, double* psum) { +- double sum = 0; +- for (; p < pe; ++p) sum += (*p) * (*p); +- *psum = sum; +-} +-template +-inline void ComputeVXY(const avec& vec1, const avec& vec2, +- avec& result, size_t part = 0, size_t skip = 0) { +- const Float *it1 = vec1.begin() + skip, *it2 = vec2.begin() + skip; +- const Float* ite = part ? 
(it1 + part) : vec1.end(); +- Float* it3 = result.begin() + skip; +- for (; it1 < ite; ++it1, ++it2, ++it3) { +- (*it3) = (*it1) * (*it2); +- } +-} +-template +-void ScaleJ8(Float* jcx, Float* jcy, const Float* sj) { +- for (int i = 0; i < 8; ++i) { +- jcx[i] *= sj[i]; +- jcy[i] *= sj[i]; +- } +-} +- +-template +-inline void AddScaledVec8(Float a, const Float* x, Float* v) { +- for (int i = 0; i < 8; ++i) v[i] += (a * x[i]); +-} +- +-template +-void ComputeSAX(Float a, const avec& vec1, avec& result) { +- const Float* it1 = vec1.begin(); +- Float* it3 = result.begin(); +- for (; it1 < vec1.end(); ++it1, ++it3) { +- (*it3) = (a * (*it1)); +- } +-} +- +-template +-inline void ComputeSXYPZ(Float a, const Float* p1, const Float* p2, +- const Float* p3, Float* p4, Float* pe) { +- for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) *p4 = (a * (*p1) * (*p2) + (*p3)); +-} +- +-template +-void ComputeSAXPY(Float a, const Float* it1, const Float* it2, Float* it3, +- Float* ite) { +- if (a == (Float)1.0) { +- for (; it3 < ite; ++it1, ++it2, ++it3) { +- (*it3) = ((*it1) + (*it2)); +- } +- } else { +- for (; it3 < ite; ++it1, ++it2, ++it3) { +- (*it3) = (a * (*it1) + (*it2)); +- } +- } +-} +-template +-void AddBlockJtJ(const Float* jc, Float* block, int vn) { +- for (int i = 0; i < vn; ++i) { +- Float *row = block + i * 8, a = jc[i]; +- for (int j = 0; j < vn; ++j) row[j] += a * jc[j]; +- } +-} +-#endif +- +-#ifdef _WIN32 +-#define DEFINE_THREAD_DATA(X) \ +- template \ +- struct X##_STRUCT { +-#define DECLEAR_THREAD_DATA(X, ...) \ +- X##_STRUCT tdata = {__VA_ARGS__}; \ +- X##_STRUCT* newdata = new X##_STRUCT(tdata) +-#define BEGIN_THREAD_PROC(X) \ +- } \ +- ; \ +- template \ +- DWORD X##_PROC(X##_STRUCT* q) { +-#define END_THREAD_RPOC(X) \ +- delete q; \ +- return 0; \ +- } +- +-#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP +-#define MYTHREAD std::thread +-#define RUN_THREAD(X, t, ...) \ +- DECLEAR_THREAD_DATA(X, __VA_ARGS__); \ +- t = std::thread(X##_PROC, newdata) +-#define WAIT_THREAD(tv, n) \ +- { \ +- for (size_t i = 0; i < size_t(n); ++i) tv[i].join(); \ +- } +-#else +-#define MYTHREAD HANDLE +-#define RUN_THREAD(X, t, ...) \ +- DECLEAR_THREAD_DATA(X, __VA_ARGS__); \ +- t = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)X##_PROC, newdata, \ +- 0, 0) +-#define WAIT_THREAD(tv, n) \ +- { \ +- WaitForMultipleObjects((DWORD)n, tv, TRUE, INFINITE); \ +- for (size_t i = 0; i < size_t(n); ++i) CloseHandle(tv[i]); \ +- } +-#endif +-#else +-#define DEFINE_THREAD_DATA(X) \ +- template \ +- struct X##_STRUCT { \ +- int tid; +-#define DECLEAR_THREAD_DATA(X, ...) \ +- X##_STRUCT tdata = {i, __VA_ARGS__}; \ +- X##_STRUCT* newdata = new X##_STRUCT(tdata) +-#define BEGIN_THREAD_PROC(X) \ +- } \ +- ; \ +- template \ +- void* X##_PROC(X##_STRUCT* q) { +-// cpu_set_t mask; CPU_ZERO( &mask ); +-// CPU_SET( q->tid, &mask ); +-// if( sched_setaffinity(0, sizeof(mask), &mask +-// ) == -1 ) +-// std::cout <<"WARNING: Could not set CPU +-// Affinity, continuing...\n"; +-#define END_THREAD_RPOC(X) \ +- delete q; \ +- return 0; \ +- } \ +- template \ +- struct X##_FUNCTOR { \ +- typedef void* (*func_type)(X##_STRUCT*); \ +- static func_type get() { return &(X##_PROC); } \ +- }; +-#define MYTHREAD pthread_t +- +-#define RUN_THREAD(X, t, ...) 
\ +- DECLEAR_THREAD_DATA(X, __VA_ARGS__); \ +- pthread_create(&t, NULL, (void* (*)(void*))X##_FUNCTOR::get(), newdata) +-#define WAIT_THREAD(tv, n) \ +- { \ +- for (size_t i = 0; i < size_t(n); ++i) pthread_join(tv[i], NULL); \ +- } +-#endif +-template +-inline void MemoryCopyB(const Float* p, const Float* pe, Float* d) { +- while (p < pe) *d++ = *p++; +-} +- +-template +-inline Float DotProduct8(const Float* v1, const Float* v2) { +- return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2] + v1[3] * v2[3] + +- v1[4] * v2[4] + v1[5] * v2[5] + v1[6] * v2[6] + v1[7] * v2[7]; +-} +-template +-inline void ComputeTwoJX(const Float* jc, const Float* jp, const Float* xc, +- const Float* xp, Float* jx) { +- jx[0] = DotProduct8(jc, xc) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- DotProduct8(jc + 8, xc) + (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]); +-} +-template +-Float ComputeVectorMax(const avec& vec) { +- Float v = 0; +- const Float* it = vec.begin(); +- for (; it < vec.end(); ++it) { +- Float vi = (Float)fabs(*it); +- v = std::max(v, vi); +- } +- return v; +-} +- +-template +-void ComputeSXYPZ(Float a, const avec& vec1, const avec& vec2, +- const avec& vec3, avec& result) { +- if (vec1.begin() != NULL) { +- const Float *p1 = &vec1[0], *p2 = &vec2[0], *p3 = &vec3[0]; +- Float *p4 = &result[0], *pe = p4 + result.size(); +- ComputeSXYPZ(a, p1, p2, p3, p4, pe); +- +- } else { +- // ComputeSAXPY(a, vec2, vec3, result, 0); +- ComputeSAXPY(a, vec2.begin(), vec3.begin(), result.begin(), +- result.end()); +- } +-} +- +-DEFINE_THREAD_DATA(ComputeSAXPY) +-Float a; +-const Float *p1, *p2; +-Float *p3, *pe; +-BEGIN_THREAD_PROC(ComputeSAXPY) +-ComputeSAXPY(q->a, q->p1, q->p2, q->p3, q->pe); +-END_THREAD_RPOC(ComputeSAXPY) +- +-template +-void ComputeSAXPY(Float a, const avec& vec1, const avec& vec2, +- avec& result, int mt = 0) { +- const bool auto_multi_thread = true; +- if (auto_multi_thread && mt == 0) { +- mt = AUTO_MT_NUM(result.size() * 2); +- } +- if (mt > 1 && result.size() >= mt * 4) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- const Float *p1 = vec1.begin(), *p2 = vec2.begin(); +- Float* p3 = result.begin(); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = (result.size() * i / thread_num + FLOAT_ALIGN - 1) / +- FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last_ = (result.size() * (i + 1) / thread_num + FLOAT_ALIGN - 1) / +- FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last = std::min(last_, result.size()); +- RUN_THREAD(ComputeSAXPY, threads[i], a, p1 + first, p2 + first, +- p3 + first, p3 + last); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- ComputeSAXPY(a, vec1.begin(), vec2.begin(), result.begin(), result.end()); +- } +-} +- +-DEFINE_THREAD_DATA(ComputeVectorNorm) +-const Float *p, *pe; +-double* sum; +-BEGIN_THREAD_PROC(ComputeVectorNorm) +-ComputeVectorNorm(q->p, q->pe, q->sum); +-END_THREAD_RPOC(ComputeVectorNorm) +- +-template +-double ComputeVectorNorm(const avec& vec, int mt) { +- const bool auto_multi_thread = true; +- if (auto_multi_thread && mt == 0) { +- mt = AUTO_MT_NUM(vec.size()); +- } +- if (mt > 1 && vec.size() >= mt * 4) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- double sumv[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- const Float* p = vec; +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = (vec.size() * i / thread_num + FLOAT_ALIGN - 1) / +- FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last_ = (vec.size() * (i + 1) / thread_num + FLOAT_ALIGN - 1) / +- 
FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last = std::min(last_, vec.size()); +- RUN_THREAD(ComputeVectorNorm, threads[i], p + first, p + last, sumv + i); +- } +- WAIT_THREAD(threads, thread_num); +- double sum = 0; +- for (size_t i = 0; i < thread_num; ++i) sum += sumv[i]; +- return sum; +- } else { +- double sum; +- ComputeVectorNorm(vec.begin(), vec.end(), &sum); +- return sum; +- } +-} +- +-template +-void GetRodriguesRotation(const Float m[3][3], Float r[3]) { +- // http://www.euclideanspace.com/maths/geometry/rotations/conversions/matrixToAngle/index.htm +- double a = (m[0][0] + m[1][1] + m[2][2] - 1.0) / 2.0; +- const double epsilon = 0.01; +- if (fabs(m[0][1] - m[1][0]) < epsilon && fabs(m[1][2] - m[2][1]) < epsilon && +- fabs(m[0][2] - m[2][0]) < epsilon) { +- if (fabs(m[0][1] + m[1][0]) < 0.1 && fabs(m[1][2] + m[2][1]) < 0.1 && +- fabs(m[0][2] + m[2][0]) < 0.1 && a > 0.9) { +- r[0] = 0; +- r[1] = 0; +- r[2] = 0; +- } else { +- const Float ha = Float(sqrt(0.5) * 3.14159265358979323846); +- double xx = (m[0][0] + 1.0) / 2.0; +- double yy = (m[1][1] + 1.0) / 2.0; +- double zz = (m[2][2] + 1.0) / 2.0; +- double xy = (m[0][1] + m[1][0]) / 4.0; +- double xz = (m[0][2] + m[2][0]) / 4.0; +- double yz = (m[1][2] + m[2][1]) / 4.0; +- +- if ((xx > yy) && (xx > zz)) { +- if (xx < epsilon) { +- r[0] = 0; +- r[1] = r[2] = ha; +- } else { +- double t = sqrt(xx); +- r[0] = Float(t * 3.14159265358979323846); +- r[1] = Float(xy / t * 3.14159265358979323846); +- r[2] = Float(xz / t * 3.14159265358979323846); +- } +- } else if (yy > zz) { +- if (yy < epsilon) { +- r[0] = r[2] = ha; +- r[1] = 0; +- } else { +- double t = sqrt(yy); +- r[0] = Float(xy / t * 3.14159265358979323846); +- r[1] = Float(t * 3.14159265358979323846); +- r[2] = Float(yz / t * 3.14159265358979323846); +- } +- } else { +- if (zz < epsilon) { +- r[0] = r[1] = ha; +- r[2] = 0; +- } else { +- double t = sqrt(zz); +- r[0] = Float(xz / t * 3.14159265358979323846); +- r[1] = Float(yz / t * 3.14159265358979323846); +- r[2] = Float(t * 3.14159265358979323846); +- } +- } +- } +- } else { +- a = acos(a); +- double b = 0.5 * a / sin(a); +- r[0] = Float(b * (m[2][1] - m[1][2])); +- r[1] = Float(b * (m[0][2] - m[2][0])); +- r[2] = Float(b * (m[1][0] - m[0][1])); +- } +-} +-template +-void UncompressRodriguesRotation(const Float r[3], Float m[]) { +- double a = sqrt(r[0] * r[0] + r[1] * r[1] + r[2] * r[2]); +- double ct = a == 0.0 ? 0.5f : (1.0f - cos(a)) / a / a; +- double st = a == 0.0 ? 
1 : sin(a) / a; +- m[0] = Float(1.0 - (r[1] * r[1] + r[2] * r[2]) * ct); +- m[1] = Float(r[0] * r[1] * ct - r[2] * st); +- m[2] = Float(r[2] * r[0] * ct + r[1] * st); +- m[3] = Float(r[0] * r[1] * ct + r[2] * st); +- m[4] = Float(1.0f - (r[2] * r[2] + r[0] * r[0]) * ct); +- m[5] = Float(r[1] * r[2] * ct - r[0] * st); +- m[6] = Float(r[2] * r[0] * ct - r[1] * st); +- m[7] = Float(r[1] * r[2] * ct + r[0] * st); +- m[8] = Float(1.0 - (r[0] * r[0] + r[1] * r[1]) * ct); +-} +-template +-void UpdateCamera(int ncam, const avec& camera, const avec& delta, +- avec& new_camera) { +- const Float *c = &camera[0], *d = &delta[0]; +- Float *nc = &new_camera[0], m[9]; +- // f[1], t[3], r[3][3], d[1] +- for (int i = 0; i < ncam; ++i, c += 16, d += 8, nc += 16) { +- nc[0] = max(c[0] + d[0], ((Float)1e-10)); +- nc[1] = c[1] + d[1]; +- nc[2] = c[2] + d[2]; +- nc[3] = c[3] + d[3]; +- nc[13] = c[13] + d[7]; +- +- //////////////////////////////////////////////////// +- UncompressRodriguesRotation(d + 4, m); +- nc[4] = m[0] * c[4 + 0] + m[1] * c[4 + 3] + m[2] * c[4 + 6]; +- nc[5] = m[0] * c[4 + 1] + m[1] * c[4 + 4] + m[2] * c[4 + 7]; +- nc[6] = m[0] * c[4 + 2] + m[1] * c[4 + 5] + m[2] * c[4 + 8]; +- nc[7] = m[3] * c[4 + 0] + m[4] * c[4 + 3] + m[5] * c[4 + 6]; +- nc[8] = m[3] * c[4 + 1] + m[4] * c[4 + 4] + m[5] * c[4 + 7]; +- nc[9] = m[3] * c[4 + 2] + m[4] * c[4 + 5] + m[5] * c[4 + 8]; +- nc[10] = m[6] * c[4 + 0] + m[7] * c[4 + 3] + m[8] * c[4 + 6]; +- nc[11] = m[6] * c[4 + 1] + m[7] * c[4 + 4] + m[8] * c[4 + 7]; +- nc[12] = m[6] * c[4 + 2] + m[7] * c[4 + 5] + m[8] * c[4 + 8]; +- +- // Float temp[3]; +- // GetRodriguesRotation((Float (*)[3]) (nc + 4), temp); +- // UncompressRodriguesRotation(temp, nc + 4); +- nc[14] = c[14]; +- nc[15] = c[15]; +- } +-} +- +-template +-void UpdateCameraPoint(int ncam, const avec& camera, +- const avec& point, avec& delta, +- avec& new_camera, avec& new_point, +- int mode, int mt) { +- //////////////////////////// +- if (mode != 2) { +- UpdateCamera(ncam, camera, delta, new_camera); +- } +- ///////////////////////////// +- if (mode != 1) { +- avec dp; +- dp.set(delta.begin() + 8 * ncam, point.size()); +- ComputeSAXPY((Float)1.0, dp, point, new_point, mt); +- } +-} +- +-template +-void ComputeProjection(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt); +- +-DEFINE_THREAD_DATA(ComputeProjection) +-size_t nproj; +-const Float *camera, *point, *ms; +-const int* jmap; +-Float* pj; +-int radial_distortion; +-BEGIN_THREAD_PROC(ComputeProjection) +-ComputeProjection(q->nproj, q->camera, q->point, q->ms, q->jmap, q->pj, +- q->radial_distortion, 0); +-END_THREAD_RPOC(ComputeProjection) +- +-template +-void ComputeProjection(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeProjection, threads[i], last - first, camera, point, +- ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial); +- } +- WAIT_THREAD(threads, thread_num); +- +- } else { +- for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) { +- const Float* c = camera + jmap[0] * 16; +- const Float* m = point + jmap[1] * POINT_ALIGN; +- 
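// In the scalar loop below, each camera is packed as c = (f, t[3], R[3][3] row-major, k, ...)
// (cf. the "f[1], t[3], r[3][3], d[1]" layout noted in UpdateCamera above), so
//     (p0, p1, p2)^T = R * M + t,   with f = c[0] and distortion parameter k = c[13],
// and the residual written to pj is
//     radial ==  1 :  pj = ms - f * (1 + k * (p0^2 + p1^2) / p2^2) * (p0, p1) / p2
//     radial == -1 :  pj = ms * (1 + k * (ms_x^2 + ms_y^2)) - f * (p0, p1) / p2
//     otherwise    :  pj = ms - f * (p0, p1) / p2
// i.e. distortion is applied to the projection for radial == 1 and to the measurement
// for radial == -1.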
///////////////////////////////////////////////////// +- Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1]; +- Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2]; +- Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3]; +- +- if (radial == 1) { +- Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2); +- Float f_p2 = c[0] * rr / p2; +- pj[0] = ms[0] - p0 * f_p2; +- pj[1] = ms[1] - p1 * f_p2; +- } else if (radial == -1) { +- Float f_p2 = c[0] / p2; +- Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]); +- pj[0] = ms[0] * rd - p0 * f_p2; +- pj[1] = ms[1] * rd - p1 * f_p2; +- } else { +- pj[0] = ms[0] - p0 * c[0] / p2; +- pj[1] = ms[1] - p1 * c[0] / p2; +- } +- } +- } +-} +- +-template +-void ComputeProjectionX(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt); +- +-DEFINE_THREAD_DATA(ComputeProjectionX) +-size_t nproj; +-const Float *camera, *point, *ms; +-const int* jmap; +-Float* pj; +-int radial_distortion; +-BEGIN_THREAD_PROC(ComputeProjectionX) +-ComputeProjectionX(q->nproj, q->camera, q->point, q->ms, q->jmap, q->pj, +- q->radial_distortion, 0); +-END_THREAD_RPOC(ComputeProjectionX) +- +-template +-void ComputeProjectionX(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeProjectionX, threads[i], last - first, camera, point, +- ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) { +- const Float* c = camera + jmap[0] * 16; +- const Float* m = point + jmap[1] * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1]; +- Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2]; +- Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3]; +- if (radial == 1) { +- Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2); +- Float f_p2 = c[0] / p2; +- pj[0] = ms[0] / rr - p0 * f_p2; +- pj[1] = ms[1] / rr - p1 * f_p2; +- } else if (radial == -1) { +- Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]); +- Float f_p2 = c[0] / p2 / rd; +- pj[0] = ms[0] - p0 * f_p2; +- pj[1] = ms[1] - p1 * f_p2; +- } else { +- pj[0] = ms[0] - p0 * c[0] / p2; +- pj[1] = ms[1] - p1 * c[0] / p2; +- } +- } +- } +-} +- +-template +-void ComputeProjectionQ(size_t nq, const Float* camera, const int* qmap, +- const Float* wq, Float* pj) { +- for (size_t i = 0; i < nq; ++i, qmap += 2, pj += 2, wq += 2) { +- const Float* c1 = camera + qmap[0] * 16; +- const Float* c2 = camera + qmap[1] * 16; +- pj[0] = -(c1[0] - c2[0]) * wq[0]; +- pj[1] = -(c1[13] - c2[13]) * wq[1]; +- } +-} +- +-template +-void ComputeJQX(size_t nq, const Float* x, const int* qmap, const Float* wq, +- const Float* sj, Float* jx) { +- if (sj) { +- for (size_t i = 0; i < nq; ++i, qmap += 2, jx += 2, wq += 2) { +- int idx1 = qmap[0] * 8, idx2 = qmap[1] * 8; +- const Float* x1 = x + idx1; +- const Float* x2 = x + idx2; +- const Float* sj1 = sj + idx1; +- const Float* sj2 = sj + idx2; +- jx[0] = (x1[0] * sj1[0] - x2[0] * 
sj2[0]) * wq[0]; +- jx[1] = (x1[7] * sj1[7] - x2[7] * sj2[7]) * wq[1]; +- } +- } else { +- for (size_t i = 0; i < nq; ++i, qmap += 2, jx += 2, wq += 2) { +- const Float* x1 = x + qmap[0] * 8; +- const Float* x2 = x + qmap[1] * 8; +- jx[0] = (x1[0] - x2[0]) * wq[0]; +- jx[1] = (x1[7] - x2[7]) * wq[1]; +- } +- } +-} +- +-template +-void ComputeJQtEC(size_t ncam, const Float* pe, const int* qlist, +- const Float* wq, const Float* sj, Float* v) { +- if (sj) { +- for (size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8, sj += 8) { +- int ip = qlist[0]; +- if (ip == -1) continue; +- int in = qlist[1]; +- const Float* e1 = pe + ip * 2; +- const Float* e2 = pe + in * 2; +- v[0] += wq[0] * sj[0] * (e1[0] - e2[0]); +- v[7] += wq[1] * sj[7] * (e1[1] - e2[1]); +- } +- } else { +- for (size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8) { +- int ip = qlist[0]; +- if (ip == -1) continue; +- int in = qlist[1]; +- const Float* e1 = pe + ip * 2; +- const Float* e2 = pe + in * 2; +- v[0] += wq[0] * (e1[0] - e2[0]); +- v[7] += wq[1] * (e1[1] - e2[1]); +- } +- } +-} +- +-template +-inline void JacobianOne(const Float* c, const Float* pt, const Float* ms, +- Float* jxc, Float* jyc, Float* jxp, Float* jyp, +- bool intrinsic_fixed, int radial_distortion) { +- const Float* r = c + 4; +- Float x0 = c[4] * pt[0] + c[5] * pt[1] + c[6] * pt[2]; +- Float y0 = c[7] * pt[0] + c[8] * pt[1] + c[9] * pt[2]; +- Float z0 = c[10] * pt[0] + c[11] * pt[1] + c[12] * pt[2]; +- Float p2 = (z0 + c[3]); +- Float f_p2 = c[0] / p2; +- Float p0_p2 = (x0 + c[1]) / p2; +- Float p1_p2 = (y0 + c[2]) / p2; +- +- if (radial_distortion == 1) { +- Float rr1 = c[13] * p0_p2 * p0_p2; +- Float rr2 = c[13] * p1_p2 * p1_p2; +- Float f_p2_x = Float(f_p2 * (1.0 + 3.0 * rr1 + rr2)); +- Float f_p2_y = Float(f_p2 * (1.0 + 3.0 * rr2 + rr1)); +- if (jxc) { +-#ifndef PBA_DISABLE_CONST_CAMERA +- if (c[15] != 0.0f) { +- jxc[0] = 0; +- jxc[1] = 0; +- jxc[2] = 0; +- jxc[3] = 0; +- jxc[4] = 0; +- jxc[5] = 0; +- jxc[6] = 0; +- jxc[7] = 0; +- jyc[0] = 0; +- jyc[1] = 0; +- jyc[2] = 0; +- jyc[3] = 0; +- jyc[4] = 0; +- jyc[5] = 0; +- jyc[6] = 0; +- jyc[7] = 0; +- } else +-#endif +- { +- Float jfc = intrinsic_fixed ? 0 : Float(1.0 + rr1 + rr2); +- Float ft_x_pn = +- intrinsic_fixed ? 
0 : c[0] * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- ///////////////////////////////////////////////////// +- jxc[0] = p0_p2 * jfc; +- jxc[1] = f_p2_x; +- jxc[2] = 0; +- jxc[3] = -f_p2_x * p0_p2; +- jxc[4] = -f_p2_x * p0_p2 * y0; +- jxc[5] = f_p2_x * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2_x * y0; +- jxc[7] = ft_x_pn * p0_p2; +- +- jyc[0] = p1_p2 * jfc; +- jyc[1] = 0; +- jyc[2] = f_p2_y; +- jyc[3] = -f_p2_y * p1_p2; +- jyc[4] = -f_p2_y * (z0 + y0 * p1_p2); +- jyc[5] = f_p2_y * x0 * p1_p2; +- jyc[6] = f_p2_y * x0; +- jyc[7] = ft_x_pn * p1_p2; +- } +- } +- +- /////////////////////////////////// +- if (jxp) { +- jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2); +-#ifdef POINT_DATA_ALIGN4 +- jxp[3] = jyp[3] = 0; +-#endif +- } +- } else { +- if (jxc) { +-#ifndef PBA_DISABLE_CONST_CAMERA +- if (c[15] != 0.0f) { +- jxc[0] = 0; +- jxc[1] = 0; +- jxc[2] = 0; +- jxc[3] = 0; +- jxc[4] = 0; +- jxc[5] = 0; +- jxc[6] = 0; +- jxc[7] = 0; +- jyc[0] = 0; +- jyc[1] = 0; +- jyc[2] = 0; +- jyc[3] = 0; +- jyc[4] = 0; +- jyc[5] = 0; +- jyc[6] = 0; +- jyc[7] = 0; +- } else +-#endif +- { +- jxc[0] = intrinsic_fixed ? 0 : p0_p2; +- jxc[1] = f_p2; +- jxc[2] = 0; +- jxc[3] = -f_p2 * p0_p2; +- jxc[4] = -f_p2 * p0_p2 * y0; +- jxc[5] = f_p2 * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2 * y0; +- +- jyc[0] = intrinsic_fixed ? 0 : p1_p2; +- jyc[1] = 0; +- jyc[2] = f_p2; +- jyc[3] = -f_p2 * p1_p2; +- jyc[4] = -f_p2 * (z0 + y0 * p1_p2); +- jyc[5] = f_p2 * x0 * p1_p2; +- jyc[6] = f_p2 * x0; +- +- if (radial_distortion == -1 && !intrinsic_fixed) { +- Float msn = ms[0] * ms[0] + ms[1] * ms[1]; +- jxc[7] = -ms[0] * msn; +- jyc[7] = -ms[1] * msn; +- } else { +- jxc[7] = 0; +- jyc[7] = 0; +- } +- } +- } +- /////////////////////////////////// +- if (jxp) { +- jxp[0] = f_p2 * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2 * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2 * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2 * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2 * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2 * (r[5] - r[8] * p1_p2); +-#ifdef POINT_DATA_ALIGN4 +- jxp[3] = jyp[3] = 0; +-#endif +- } +- } +-} +- +-template +-void ComputeJacobian(size_t nproj, size_t ncam, const Float* camera, +- const Float* point, Float* jc, Float* jp, const int* jmap, +- const Float* sj, const Float* ms, const int* cmlist, +- bool intrinsic_fixed, int radial_distortion, bool shuffle, +- Float* jct, int mt = 2, int i0 = 0); +- +-DEFINE_THREAD_DATA(ComputeJacobian) +-size_t nproj, ncam; +-const Float *camera, *point; +-Float *jc, *jp; +-const int* jmap; +-const Float *sj, *ms; +-const int* cmlist; +-bool intrinsic_fixed; +-int radial_distortion; +-bool shuffle; +-Float* jct; +-int i0; +-BEGIN_THREAD_PROC(ComputeJacobian) +-ComputeJacobian(q->nproj, q->ncam, q->camera, q->point, q->jc, q->jp, q->jmap, +- q->sj, q->ms, q->cmlist, q->intrinsic_fixed, +- q->radial_distortion, q->shuffle, q->jct, 0, q->i0); +-END_THREAD_RPOC(ComputeJacobian) +- +-template +-void ComputeJacobian(size_t nproj, size_t ncam, const Float* camera, +- const Float* point, Float* jc, Float* jp, const int* jmap, +- const Float* sj, const Float* ms, const int* cmlist, +- bool intrinsic_fixed, int radial_distortion, bool shuffle, +- Float* jct, int mt, int i0) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < 
thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeJacobian, threads[i], last, ncam, camera, point, jc, jp, +- jmap + 2 * first, sj, ms + 2 * first, cmlist + first, +- intrinsic_fixed, radial_distortion, shuffle, jct, first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- const Float* sjc0 = sj; +- const Float* sjp0 = sj ? sj + ncam * 8 : NULL; +- +- for (size_t i = i0; i < nproj; ++i, jmap += 2, ms += 2, ++cmlist) { +- int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- Float* jci = jc ? (jc + (shuffle ? cmlist[0] : i) * 16) : NULL; +- Float* jpi = jp ? (jp + i * POINT_ALIGN2) : NULL; +- +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jci, jci + 8, jpi, jpi + POINT_ALIGN, +- intrinsic_fixed, radial_distortion); +- +- /////////////////////////////////////////////////// +- if (sjc0) { +- // jacobian scaling +- if (jci) { +- ScaleJ8(jci, jci + 8, sjc0 + cidx * 8); +- } +- if (jpi) { +- const Float* sjp = sjp0 + pidx * POINT_ALIGN; +- for (int j = 0; j < 3; ++j) { +- jpi[j] *= sjp[j]; +- jpi[POINT_ALIGN + j] *= sjp[j]; +- } +- } +- } +- +- if (jct && jc) MemoryCopyB(jci, jci + 16, jct + cmlist[0] * 16); +- } +- } +-} +- +-template +-void ComputeDiagonalAddQ(size_t ncam, const Float* qw, Float* d, +- const Float* sj = NULL) { +- if (sj) { +- for (size_t i = 0; i < ncam; ++i, qw += 2, d += 8, sj += 8) { +- if (qw[0] == 0) continue; +- Float j1 = qw[0] * sj[0]; +- Float j2 = qw[1] * sj[7]; +- d[0] += (j1 * j1 * 2.0f); +- d[7] += (j2 * j2 * 2.0f); +- } +- } else { +- for (size_t i = 0; i < ncam; ++i, qw += 2, d += 8) { +- if (qw[0] == 0) continue; +- d[0] += (qw[0] * qw[0] * 2.0f); +- d[7] += (qw[1] * qw[1] * 2.0f); +- } +- } +-} +- +-/////////////////////////////////////// +-template +-void ComputeDiagonal(const avec& jcv, const vector& cmapv, +- const avec& jpv, const vector& pmapv, +- const vector& cmlistv, const Float* qw0, +- avec& jtjdi, bool jc_transpose, int radial) { +- // first camera part +- if (jcv.size() == 0 || jpv.size() == 0) return; // not gonna happen +- +- size_t ncam = cmapv.size() - 1, npts = pmapv.size() - 1; +- const int vn = radial ? 8 : 7; +- SetVectorZero(jtjdi); +- +- const int* cmap = &cmapv[0]; +- const int* pmap = &pmapv[0]; +- const int* cmlist = &cmlistv[0]; +- const Float* jc = &jcv[0]; +- const Float* jp = &jpv[0]; +- const Float* qw = qw0; +- Float* jji = &jtjdi[0]; +- +- ///////compute jc part +- for (size_t i = 0; i < ncam; ++i, jji += 8, ++cmap, qw += 2) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- ////////////////////////////////////// +- for (int j = idx1; j < idx2; ++j) { +- int idx = jc_transpose ? j : cmlist[j]; +- const Float* pj = jc + idx * 16; +- /////////////////////////////////////////// +- for (int k = 0; k < vn; ++k) +- jji[k] += (pj[k] * pj[k] + pj[k + 8] * pj[k + 8]); +- } +- if (qw0 && qw[0] > 0) { +- jji[0] += (qw[0] * qw[0] * 2.0f); +- jji[7] += (qw[1] * qw[1] * 2.0f); +- } +- } +- +- for (size_t i = 0; i < npts; ++i, jji += POINT_ALIGN, ++pmap) { +- int idx1 = pmap[0], idx2 = pmap[1]; +- const Float* pj = jp + idx1 * POINT_ALIGN2; +- for (int j = idx1; j < idx2; ++j, pj += POINT_ALIGN2) { +- for (int k = 0; k < 3; ++k) +- jji[k] += (pj[k] * pj[k] + pj[k + POINT_ALIGN] * pj[k + POINT_ALIGN]); +- } +- } +- Float* it = jtjdi.begin(); +- for (; it < jtjdi.end(); ++it) { +- *it = (*it == 0) ? 
0 : Float(1.0 / (*it)); +- } +-} +- +-template +-void InvertSymmetricMatrix(T a[n][m], T ai[n][m]) { +- for (int i = 0; i < n; ++i) { +- if (a[i][i] > 0) { +- a[i][i] = sqrt(a[i][i]); +- for (int j = i + 1; j < n; ++j) a[j][i] = a[j][i] / a[i][i]; +- for (int j = i + 1; j < n; ++j) +- for (int k = j; k < n; ++k) a[k][j] -= a[k][i] * a[j][i]; +- } +- } +- ///////////////////////////// +- // inv(L) +- for (int i = 0; i < n; ++i) { +- if (a[i][i] == 0) continue; +- a[i][i] = 1.0f / a[i][i]; +- } +- for (int i = 1; i < n; ++i) { +- if (a[i][i] == 0) continue; +- for (int j = 0; j < i; ++j) { +- T sum = 0; +- for (int k = j; k < i; ++k) sum += (a[i][k] * a[k][j]); +- a[i][j] = -sum * a[i][i]; +- } +- } +- ///////////////////////////// +- // inv(L)' * inv(L) +- for (int i = 0; i < n; ++i) { +- for (int j = i; j < n; ++j) { +- ai[i][j] = 0; +- for (int k = j; k < n; ++k) ai[i][j] += a[k][i] * a[k][j]; +- ai[j][i] = ai[i][j]; +- } +- } +-} +-template +-void InvertSymmetricMatrix(T* a, T* ai) { +- InvertSymmetricMatrix((T(*)[m])a, (T(*)[m])ai); +-} +- +-template +-void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2, +- const Float* jc, const int* cmap, const int* cmlist, +- Float* di, Float* bi, int vn, bool jc_transpose, +- bool use_jq, int mt); +- +-DEFINE_THREAD_DATA(ComputeDiagonalBlockC) +-size_t ncam; +-float lambda1, lambda2; +-const Float* jc; +-const int *cmap, *cmlist; +-Float *di, *bi; +-int vn; +-bool jc_transpose, use_jq; +-BEGIN_THREAD_PROC(ComputeDiagonalBlockC) +-ComputeDiagonalBlockC(q->ncam, q->lambda1, q->lambda2, q->jc, q->cmap, +- q->cmlist, q->di, q->bi, q->vn, q->jc_transpose, +- q->use_jq, 0); +-END_THREAD_RPOC(ComputeDiagonalBlockC) +- +-template +-void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2, +- const Float* jc, const int* cmap, const int* cmlist, +- Float* di, Float* bi, int vn, bool jc_transpose, +- bool use_jq, int mt) { +- const size_t bc = vn * 8; +- +- if (mt > 1 && ncam >= (size_t)mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = ncam * i / thread_num; +- size_t last_ = ncam * (i + 1) / thread_num; +- size_t last = std::min(last_, ncam); +- RUN_THREAD(ComputeDiagonalBlockC, threads[i], (last - first), lambda1, +- lambda2, jc, cmap + first, cmlist, di + 8 * first, +- bi + bc * first, vn, jc_transpose, use_jq); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- Float bufv[64 + 8]; // size_t offset = ((size_t)bufv) & 0xf; +- // Float* pbuf = bufv + ((16 - offset) / sizeof(Float)); +- Float* pbuf = (Float*)ALIGN_PTR(bufv); +- +- ///////compute jc part +- for (size_t i = 0; i < ncam; ++i, ++cmap, bi += bc) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- ////////////////////////////////////// +- if (idx1 == idx2) { +- SetVectorZero(bi, bi + bc); +- } else { +- SetVectorZero(pbuf, pbuf + 64); +- +- for (int j = idx1; j < idx2; ++j) { +- int idx = jc_transpose ? j : cmlist[j]; +- const Float* pj = jc + idx * 16; +- ///////////////////////////////// +- AddBlockJtJ(pj, pbuf, vn); +- AddBlockJtJ(pj + 8, pbuf, vn); +- } +- +- // change and copy the diagonal +- +- if (use_jq) { +- Float* pb = pbuf; +- for (int j = 0; j < 8; ++j, ++di, pb += 9) { +- Float temp; +- di[0] = temp = (di[0] + pb[0]); +- pb[0] = lambda2 * temp + lambda1; +- } +- } else { +- Float* pb = pbuf; +- for (int j = 0; j < 8; ++j, ++di, pb += 9) { +- *pb = lambda2 * ((*di) = (*pb)) + lambda1; +- } +- } +- +- // invert the matrix? 
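// The per-camera J^T J block accumulated in pbuf (8x8, or 7x7 when radial
// distortion is not solved for) is inverted with the Cholesky-based
// InvertSymmetricMatrix above: factor A = L * L^T in place, invert the
// triangular factor, then form A^{-1} = L^{-T} * L^{-1}.  The damped diagonal
// (lambda2 * d + lambda1) is written into pbuf just before the inversion,
// while the undamped diagonal is kept in di.  (The two call branches below
// differ only in their template arguments, presumably 8x8 vs 7x8 in the
// original source; the angle-bracket arguments do not survive in this copy.)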
+- if (vn == 8) +- InvertSymmetricMatrix(pbuf, bi); +- else +- InvertSymmetricMatrix(pbuf, bi); +- } +- } +- } +-} +- +-template +-void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2, +- const Float* jp, const int* pmap, Float* di, +- Float* bi, int mt); +- +-DEFINE_THREAD_DATA(ComputeDiagonalBlockP) +-size_t npt; +-float lambda1, lambda2; +-const Float* jp; +-const int* pmap; +-Float *di, *bi; +-BEGIN_THREAD_PROC(ComputeDiagonalBlockP) +-ComputeDiagonalBlockP(q->npt, q->lambda1, q->lambda2, q->jp, q->pmap, q->di, +- q->bi, 0); +-END_THREAD_RPOC(ComputeDiagonalBlockP) +- +-template +-void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2, +- const Float* jp, const int* pmap, Float* di, +- Float* bi, int mt) { +- if (mt > 1) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = npt * i / thread_num; +- size_t last_ = npt * (i + 1) / thread_num; +- size_t last = std::min(last_, npt); +- RUN_THREAD(ComputeDiagonalBlockP, threads[i], (last - first), lambda1, +- lambda2, jp, pmap + first, di + POINT_ALIGN * first, +- bi + 6 * first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (size_t i = 0; i < npt; ++i, ++pmap, di += POINT_ALIGN, bi += 6) { +- int idx1 = pmap[0], idx2 = pmap[1]; +- +- Float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0; +- const Float *jxp = jp + idx1 * (POINT_ALIGN2), *jyp = jxp + POINT_ALIGN; +- for (int j = idx1; j < idx2; +- ++j, jxp += POINT_ALIGN2, jyp += POINT_ALIGN2) { +- M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]); +- M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]); +- M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]); +- M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]); +- M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]); +- M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]); +- } +- +- ///////////////////////////////// +- di[0] = M00; +- di[1] = M11; +- di[2] = M22; +- +- ///////////////////////////// +- M00 = M00 * lambda2 + lambda1; +- M11 = M11 * lambda2 + lambda1; +- M22 = M22 * lambda2 + lambda1; +- +- /////////////////////////////// +- Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- // SetVectorZero(bi, bi + 6); +- for (int j = 0; j < 6; ++j) bi[j] = 0; +- } else { +- bi[0] = (M11 * M22 - M12 * M12) / det; +- bi[1] = -(M01 * M22 - M12 * M02) / det; +- bi[2] = (M01 * M12 - M02 * M11) / det; +- bi[3] = (M00 * M22 - M02 * M02) / det; +- bi[4] = -(M00 * M12 - M01 * M02) / det; +- bi[5] = (M00 * M11 - M01 * M01) / det; +- } +- } +- } +-} +- +-template +-void ComputeDiagonalBlock(size_t ncam, size_t npts, float lambda, bool dampd, +- const Float* jc, const int* cmap, const Float* jp, +- const int* pmap, const int* cmlist, const Float* sj, +- const Float* wq, Float* diag, Float* blocks, +- int radial_distortion, bool jc_transpose, int mt1 = 2, +- int mt2 = 2, int mode = 0) { +- const int vn = radial_distortion ? 8 : 7; +- const size_t bc = vn * 8; +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? 
(1.0f + lambda) : 1.0f; +- +- if (mode == 0) { +- const size_t bsz = bc * ncam + npts * 6; +- const size_t dsz = 8 * ncam + npts * POINT_ALIGN; +- bool use_jq = wq != NULL; +- /////////////////////////////////////////// +- SetVectorZero(blocks, blocks + bsz); +- SetVectorZero(diag, diag + dsz); +- +- //////////////////////////////// +- if (use_jq) ComputeDiagonalAddQ(ncam, wq, diag, sj); +- ComputeDiagonalBlockC(ncam, lambda1, lambda2, jc, cmap, cmlist, diag, +- blocks, vn, jc_transpose, use_jq, mt1); +- ComputeDiagonalBlockP(npts, lambda1, lambda2, jp, pmap, diag + 8 * ncam, +- blocks + bc * ncam, mt2); +- } else if (mode == 1) { +- const size_t bsz = bc * ncam; +- const size_t dsz = 8 * ncam; +- bool use_jq = wq != NULL; +- /////////////////////////////////////////// +- SetVectorZero(blocks, blocks + bsz); +- SetVectorZero(diag, diag + dsz); +- +- //////////////////////////////// +- if (use_jq) ComputeDiagonalAddQ(ncam, wq, diag, sj); +- ComputeDiagonalBlockC(ncam, lambda1, lambda2, jc, cmap, cmlist, diag, +- blocks, vn, jc_transpose, use_jq, mt1); +- } else { +- blocks += bc * ncam; +- diag += 8 * ncam; +- const size_t bsz = npts * 6; +- const size_t dsz = npts * POINT_ALIGN; +- /////////////////////////////////////////// +- SetVectorZero(blocks, blocks + bsz); +- SetVectorZero(diag, diag + dsz); +- +- //////////////////////////////// +- ComputeDiagonalBlockP(npts, lambda1, lambda2, jp, pmap, diag, blocks, mt2); +- } +-} +- +-template +-void ComputeDiagonalBlock_(float lambda, bool dampd, const avec& camerav, +- const avec& pointv, const avec& meas, +- const vector& jmapv, const avec& sjv, +- avec& qwv, avec& diag, +- avec& blocks, bool intrinsic_fixed, +- int radial_distortion, int mode = 0) { +- const int vn = radial_distortion ? 8 : 7; +- const size_t szbc = vn * 8; +- size_t ncam = camerav.size() / 16; +- size_t npts = pointv.size() / POINT_ALIGN; +- size_t sz_jcd = ncam * 8; +- size_t sz_jcb = ncam * szbc; +- avec blockpv(blocks.size()); +- SetVectorZero(blockpv); +- SetVectorZero(diag); +- ////////////////////////////////////////////////////// +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? (1.0f + lambda) : 1.0f; +- +- Float jbufv[24 + 8]; // size_t offset = ((size_t) jbufv) & 0xf; +- // Float* jxc = jbufv + ((16 - offset) / sizeof(Float)); +- Float* jxc = (Float*)ALIGN_PTR(jbufv); +- Float *jyc = jxc + 8, *jxp = jxc + 16, *jyp = jxc + 20; +- +- ////////////////////////////// +- const int* jmap = &jmapv[0]; +- const Float* camera = &camerav[0]; +- const Float* point = &pointv[0]; +- const Float* ms = &meas[0]; +- const Float* sjc0 = sjv.size() ? &sjv[0] : NULL; +- const Float* sjp0 = sjv.size() ? 
&sjv[sz_jcd] : NULL; +- ////////////////////////////////////////////// +- Float *blockpc = &blockpv[0], *blockpp = &blockpv[sz_jcb]; +- Float *bo = blockpc, *bi = &blocks[0], *di = &diag[0]; +- +- ///////////////////////////////////////////////////////// +- // diagonal blocks +- for (size_t i = 0; i < jmapv.size(); i += 2, jmap += 2, ms += 2) { +- int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jxc, jyc, jxp, jyp, intrinsic_fixed, +- radial_distortion); +- +- /////////////////////////////////////////////////////////// +- if (mode != 2) { +- if (sjc0) { +- const Float* sjc = sjc0 + cidx * 8; +- ScaleJ8(jxc, jyc, sjc); +- } +- ///////////////////////////////////////// +- Float* bc = blockpc + cidx * szbc; +- AddBlockJtJ(jxc, bc, vn); +- AddBlockJtJ(jyc, bc, vn); +- } +- +- if (mode != 1) { +- if (sjp0) { +- const Float* sjp = sjp0 + pidx * POINT_ALIGN; +- jxp[0] *= sjp[0]; +- jxp[1] *= sjp[1]; +- jxp[2] *= sjp[2]; +- jyp[0] *= sjp[0]; +- jyp[1] *= sjp[1]; +- jyp[2] *= sjp[2]; +- } +- +- /////////////////////////////////////////// +- Float* bp = blockpp + pidx * 6; +- bp[0] += (jxp[0] * jxp[0] + jyp[0] * jyp[0]); +- bp[1] += (jxp[0] * jxp[1] + jyp[0] * jyp[1]); +- bp[2] += (jxp[0] * jxp[2] + jyp[0] * jyp[2]); +- bp[3] += (jxp[1] * jxp[1] + jyp[1] * jyp[1]); +- bp[4] += (jxp[1] * jxp[2] + jyp[1] * jyp[2]); +- bp[5] += (jxp[2] * jxp[2] + jyp[2] * jyp[2]); +- } +- } +- +- /// invert the camera part +- if (mode != 2) { +- ///////////////////////////////////////// +- const Float* qw = qwv.begin(); +- if (qw) { +- for (size_t i = 0; i < ncam; ++i, qw += 2) { +- if (qw[0] == 0) continue; +- Float* bc = blockpc + i * szbc; +- if (sjc0) { +- const Float* sjc = sjc0 + i * 8; +- Float j1 = sjc[0] * qw[0]; +- Float j2 = sjc[7] * qw[1]; +- bc[0] += (j1 * j1 * 2.0f); +- if (radial_distortion) bc[63] += (j2 * j2 * 2.0f); +- } else { +- const Float* sjc = sjc0 + i * 8; +- bc[0] += (qw[0] * qw[0] * 2.0f); +- if (radial_distortion) bc[63] += (qw[1] * qw[1] * 2.0f); +- } +- } +- } +- +- for (size_t i = 0; i < ncam; ++i, bo += szbc, bi += szbc, di += 8) { +- Float *bp = bo, *dip = di; +- for (int j = 0; j < vn; ++j, ++dip, bp += 9) { +- dip[0] = bp[0]; +- bp[0] = lambda2 * bp[0] + lambda1; +- } +- +- // invert the matrix? 
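// Same pattern as ComputeDiagonalBlockC: each accumulated camera block bo is
// damped on its diagonal and inverted into the preconditioner block bi with
// the Cholesky-based InvertSymmetricMatrix.  The 3x3 point blocks handled
// afterwards are inverted in closed form from their determinant and
// cofactors, with a zero fallback when the determinant overflows or is
// (nearly) zero.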
+- if (radial_distortion) +- InvertSymmetricMatrix(bo, bi); +- else +- InvertSymmetricMatrix(bo, bi); +- } +- } else { +- bo += szbc * ncam; +- bi += szbc * ncam; +- di += 8 * ncam; +- } +- +- /////////////////////////////////////////// +- // inverting the point part +- if (mode != 1) { +- for (size_t i = 0; i < npts; ++i, bo += 6, bi += 6, di += POINT_ALIGN) { +- Float &M00 = bo[0], &M01 = bo[1], &M02 = bo[2]; +- Float &M11 = bo[3], &M12 = bo[4], &M22 = bo[5]; +- di[0] = M00; +- di[1] = M11; +- di[2] = M22; +- +- ///////////////////////////// +- M00 = M00 * lambda2 + lambda1; +- M11 = M11 * lambda2 + lambda1; +- M22 = M22 * lambda2 + lambda1; +- +- /////////////////////////////// +- Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- for (int j = 0; j < 6; ++j) bi[j] = 0; +- } else { +- bi[0] = (M11 * M22 - M12 * M12) / det; +- bi[1] = -(M01 * M22 - M12 * M02) / det; +- bi[2] = (M01 * M12 - M02 * M11) / det; +- bi[3] = (M00 * M22 - M02 * M02) / det; +- bi[4] = -(M00 * M12 - M01 * M02) / det; +- bi[5] = (M00 * M11 - M01 * M01) / det; +- } +- } +- } +-} +- +-template +-void MultiplyBlockConditionerC(int ncam, const Float* bi, const Float* x, +- Float* vx, int vn, int mt = 0); +- +-DEFINE_THREAD_DATA(MultiplyBlockConditionerC) +-int ncam; +-const Float *bi, *x; +-Float* vx; +-int vn; +-BEGIN_THREAD_PROC(MultiplyBlockConditionerC) +-MultiplyBlockConditionerC(q->ncam, q->bi, q->x, q->vx, q->vn, 0); +-END_THREAD_RPOC(MultiplyBlockConditionerC) +- +-template +-void MultiplyBlockConditionerC(int ncam, const Float* bi, const Float* x, +- Float* vx, int vn, int mt) { +- if (mt > 1 && ncam >= mt) { +- const size_t bc = vn * 8; +- MYTHREAD threads[THREAD_NUM_MAX]; +- const int thread_num = std::min(mt, THREAD_NUM_MAX); +- for (int i = 0; i < thread_num; ++i) { +- int first = ncam * i / thread_num; +- int last_ = ncam * (i + 1) / thread_num; +- int last = std::min(last_, ncam); +- RUN_THREAD(MultiplyBlockConditionerC, threads[i], (last - first), +- bi + first * bc, x + 8 * first, vx + 8 * first, vn); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (int i = 0; i < ncam; ++i, x += 8, vx += 8) { +- Float* vxc = vx; +- for (int j = 0; j < vn; ++j, bi += 8, ++vxc) *vxc = DotProduct8(bi, x); +- } +- } +-} +- +-template +-void MultiplyBlockConditionerP(int npoint, const Float* bi, const Float* x, +- Float* vx, int mt = 0); +- +-DEFINE_THREAD_DATA(MultiplyBlockConditionerP) +-int npoint; +-const Float *bi, *x; +-Float* vx; +-BEGIN_THREAD_PROC(MultiplyBlockConditionerP) +-MultiplyBlockConditionerP(q->npoint, q->bi, q->x, q->vx, 0); +-END_THREAD_RPOC(MultiplyBlockConditionerP) +- +-template +-void MultiplyBlockConditionerP(int npoint, const Float* bi, const Float* x, +- Float* vx, int mt) { +- if (mt > 1 && npoint >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const int thread_num = std::min(mt, THREAD_NUM_MAX); +- for (int i = 0; i < thread_num; ++i) { +- int first = npoint * i / thread_num; +- int last_ = npoint * (i + 1) / thread_num; +- int last = std::min(last_, npoint); +- RUN_THREAD(MultiplyBlockConditionerP, threads[i], (last - first), +- bi + first * 6, x + POINT_ALIGN * first, +- vx + POINT_ALIGN * first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (int i = 0; i < npoint; +- ++i, bi += 6, x += POINT_ALIGN, vx += POINT_ALIGN) { +- vx[0] = (bi[0] * x[0] + bi[1] * x[1] + bi[2] * x[2]); +- vx[1] = (bi[1] * x[0] + bi[3] * x[1] + bi[4] * x[2]); +- vx[2] = (bi[2] 
* x[0] + bi[4] * x[1] + bi[5] * x[2]); +- } +- } +-} +- +-template +-void MultiplyBlockConditioner(int ncam, int npoint, const Float* blocksv, +- const Float* vec, Float* resultv, int radial, +- int mode, int mt1, int mt2) { +- const int vn = radial ? 8 : 7; +- if (mode != 2) +- MultiplyBlockConditionerC(ncam, blocksv, vec, resultv, vn, mt1); +- if (mt2 == 0) mt2 = AUTO_MT_NUM(npoint * 24); +- if (mode != 1) +- MultiplyBlockConditionerP(npoint, blocksv + (vn * 8 * ncam), vec + ncam * 8, +- resultv + 8 * ncam, mt2); +-} +- +-template +-void ComputeJX(size_t nproj, size_t ncam, const Float* x, const Float* jc, +- const Float* jp, const int* jmap, Float* jx, int mode, +- int mt = 2); +- +-DEFINE_THREAD_DATA(ComputeJX) +-size_t nproj, ncam; +-const Float *xc, *jc, *jp; +-const int* jmap; +-Float* jx; +-int mode; +-BEGIN_THREAD_PROC(ComputeJX) +-ComputeJX(q->nproj, q->ncam, q->xc, q->jc, q->jp, q->jmap, q->jx, q->mode, 0); +-END_THREAD_RPOC(ComputeJX) +- +-template +-void ComputeJX(size_t nproj, size_t ncam, const Float* x, const Float* jc, +- const Float* jp, const int* jmap, Float* jx, int mode, int mt) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeJX, threads[i], (last - first), ncam, x, +- jc + 16 * first, jp + POINT_ALIGN2 * first, jmap + first * 2, +- jx + first * 2, mode); +- } +- WAIT_THREAD(threads, thread_num); +- } else if (mode == 0) { +- const Float *pxc = x, *pxp = pxc + ncam * 8; +- // clock_t tp = clock(); double s1 = 0, s2 = 0; +- for (size_t i = 0; i < nproj; +- ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) { +- ComputeTwoJX(jc, jp, pxc + jmap[0] * 8, pxp + jmap[1] * POINT_ALIGN, jx); +- } +- } else if (mode == 1) { +- const Float* pxc = x; +- // clock_t tp = clock(); double s1 = 0, s2 = 0; +- for (size_t i = 0; i < nproj; +- ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) { +- const Float* xc = pxc + jmap[0] * 8; +- jx[0] = DotProduct8(jc, xc); +- jx[1] = DotProduct8(jc + 8, xc); +- } +- } else if (mode == 2) { +- const Float* pxp = x + ncam * 8; +- // clock_t tp = clock(); double s1 = 0, s2 = 0; +- for (size_t i = 0; i < nproj; +- ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) { +- const Float* xp = pxp + jmap[1] * POINT_ALIGN; +- jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]); +- } +- } +-} +- +-template +-void ComputeJX_(size_t nproj, size_t ncam, const Float* x, Float* jx, +- const Float* camera, const Float* point, const Float* ms, +- const Float* sj, const int* jmap, bool intrinsic_fixed, +- int radial_distortion, int mode, int mt = 16); +- +-DEFINE_THREAD_DATA(ComputeJX_) +-size_t nproj, ncam; +-const Float* x; +-Float* jx; +-const Float *camera, *point, *ms, *sj; +-const int* jmap; +-bool intrinsic_fixed; +-int radial_distortion; +-int mode; +-BEGIN_THREAD_PROC(ComputeJX_) +-ComputeJX_(q->nproj, q->ncam, q->x, q->jx, q->camera, q->point, q->ms, q->sj, +- q->jmap, q->intrinsic_fixed, q->radial_distortion, q->mode, 0); +-END_THREAD_RPOC(ComputeJX_) +- +-template +-void ComputeJX_(size_t nproj, size_t ncam, const Float* x, Float* jx, +- const Float* camera, const Float* point, const Float* ms, +- const Float* sj, const int* jmap, bool intrinsic_fixed, +- int radial_distortion, int mode, int mt) { +- if 
(mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeJX_, threads[i], (last - first), ncam, x, +- jx + first * 2, camera, point, ms + 2 * first, sj, +- jmap + first * 2, intrinsic_fixed, radial_distortion, mode); +- } +- WAIT_THREAD(threads, thread_num); +- } else if (mode == 0) { +- Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jc = jcv + (16 - offset) / sizeof(Float), *jp = jc + 16; +- Float *jc = (Float *)ALIGN_PTR(jcv), *jp = jc + 16; +- //////////////////////////////////////// +- const Float* sjc = sj; +- const Float* sjp = sjc ? (sjc + ncam * 8) : NULL; +- const Float *xc0 = x, *xp0 = x + ncam * 8; +- +- ///////////////////////////////// +- for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) { +- const int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, jp, jp + POINT_ALIGN, intrinsic_fixed, +- radial_distortion); +- if (sjc) { +- // jacobian scaling +- ScaleJ8(jc, jc + 8, sjc + cidx * 8); +- const Float* sjpi = sjp + pidx * POINT_ALIGN; +- for (int j = 0; j < 3; ++j) { +- jp[j] *= sjpi[j]; +- jp[POINT_ALIGN + j] *= sjpi[j]; +- } +- } +- //////////////////////////////////// +- ComputeTwoJX(jc, jp, xc0 + cidx * 8, xp0 + pidx * POINT_ALIGN, jx); +- } +- } else if (mode == 1) { +- Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jc = jcv + (16 - offset) / sizeof(Float); +- Float* jc = (Float*)ALIGN_PTR(jcv); +- +- //////////////////////////////////////// +- const Float *sjc = sj, *xc0 = x; +- +- ///////////////////////////////// +- for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) { +- const int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, (Float*)NULL, (Float*)NULL, +- intrinsic_fixed, radial_distortion); +- if (sjc) ScaleJ8(jc, jc + 8, sjc + cidx * 8); +- const Float* xc = xc0 + cidx * 8; +- jx[0] = DotProduct8(jc, xc); +- jx[1] = DotProduct8(jc + 8, xc); +- } +- } else if (mode == 2) { +- Float jp[8]; +- +- //////////////////////////////////////// +- const Float* sjp = sj ? 
(sj + ncam * 8) : NULL; +- const Float* xp0 = x + ncam * 8; +- +- ///////////////////////////////// +- for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) { +- const int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, (Float*)NULL, (Float*)NULL, jp, jp + POINT_ALIGN, +- intrinsic_fixed, radial_distortion); +- +- const Float* xp = xp0 + pidx * POINT_ALIGN; +- if (sjp) { +- const Float* s = sjp + pidx * POINT_ALIGN; +- jx[0] = (jp[0] * xp[0] * s[0] + jp[1] * xp[1] * s[1] + +- jp[2] * xp[2] * s[2]); +- jx[1] = (jp[3] * xp[0] * s[0] + jp[4] * xp[1] * s[1] + +- jp[5] * xp[2] * s[2]); +- } else { +- jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]); +- } +- } +- } +-} +- +-template +-void ComputeJtEC(size_t ncam, const Float* pe, const Float* jc, const int* cmap, +- const int* cmlist, Float* v, bool jc_transpose, int mt); +- +-DEFINE_THREAD_DATA(ComputeJtEC) +-size_t ncam; +-const Float *pe, *jc; +-const int *cmap, *cmlist; +-Float* v; +-bool jc_transpose; +-BEGIN_THREAD_PROC(ComputeJtEC) +-ComputeJtEC(q->ncam, q->pe, q->jc, q->cmap, q->cmlist, q->v, q->jc_transpose, +- 0); +-END_THREAD_RPOC(ComputeJtEC) +- +-template +-void ComputeJtEC(size_t ncam, const Float* pe, const Float* jc, const int* cmap, +- const int* cmlist, Float* v, bool jc_transpose, int mt) { +- if (mt > 1 && ncam >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; // if(ncam < mt) mt = ncam; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = ncam * i / thread_num; +- size_t last_ = ncam * (i + 1) / thread_num; +- size_t last = std::min(last_, ncam); +- RUN_THREAD(ComputeJtEC, threads[i], (last - first), pe, jc, cmap + first, +- cmlist, v + 8 * first, jc_transpose); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- ///////////////////////////////// +- for (size_t i = 0; i < ncam; ++i, ++cmap, v += 8) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- for (int j = idx1; j < idx2; ++j) { +- int edx = cmlist[j]; +- const Float* pj = jc + ((jc_transpose ? 
j : edx) * 16); +- const Float* e = pe + edx * 2; +- ////////////////////////////// +- AddScaledVec8(e[0], pj, v); +- AddScaledVec8(e[1], pj + 8, v); +- } +- } +- } +-} +- +-template +-void ComputeJtEP(size_t npt, const Float* pe, const Float* jp, const int* pmap, +- Float* v, int mt); +- +-DEFINE_THREAD_DATA(ComputeJtEP) +-size_t npt; +-const Float *pe, *jp; +-const int* pmap; +-Float* v; +-BEGIN_THREAD_PROC(ComputeJtEP) +-ComputeJtEP(q->npt, q->pe, q->jp, q->pmap, q->v, 0); +-END_THREAD_RPOC(ComputeJtEP) +- +-template +-void ComputeJtEP(size_t npt, const Float* pe, const Float* jp, const int* pmap, +- Float* v, int mt) { +- if (mt > 1 && npt >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = npt * i / thread_num; +- size_t last_ = npt * (i + 1) / thread_num; +- size_t last = std::min(last_, npt); +- RUN_THREAD(ComputeJtEP, threads[i], (last - first), pe, jp, pmap + first, +- v + POINT_ALIGN * first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (size_t i = 0; i < npt; ++i, ++pmap, v += POINT_ALIGN) { +- int idx1 = pmap[0], idx2 = pmap[1]; +- const Float* pj = jp + idx1 * POINT_ALIGN2; +- const Float* e = pe + idx1 * 2; +- Float temp[3] = {0, 0, 0}; +- for (int j = idx1; j < idx2; ++j, pj += POINT_ALIGN2, e += 2) { +- temp[0] += (e[0] * pj[0] + e[1] * pj[POINT_ALIGN]); +- temp[1] += (e[0] * pj[1] + e[1] * pj[POINT_ALIGN + 1]); +- temp[2] += (e[0] * pj[2] + e[1] * pj[POINT_ALIGN + 2]); +- } +- v[0] = temp[0]; +- v[1] = temp[1]; +- v[2] = temp[2]; +- } +- } +-} +- +-template +-void ComputeJtE(size_t ncam, size_t npt, const Float* pe, const Float* jc, +- const int* cmap, const int* cmlist, const Float* jp, +- const int* pmap, Float* v, bool jc_transpose, int mode, int mt1, +- int mt2) { +- if (mode != 2) { +- SetVectorZero(v, v + ncam * 8); +- ComputeJtEC(ncam, pe, jc, cmap, cmlist, v, jc_transpose, mt1); +- } +- if (mode != 1) { +- ComputeJtEP(npt, pe, jp, pmap, v + 8 * ncam, mt2); +- } +-} +- +-template +-void ComputeJtEC_(size_t ncam, const Float* ee, Float* jte, const Float* c, +- const Float* point, const Float* ms, const int* jmap, +- const int* cmap, const int* cmlist, bool intrinsic_fixed, +- int radial_distortion, int mt); +- +-DEFINE_THREAD_DATA(ComputeJtEC_) +-size_t ncam; +-const Float* ee; +-Float* jte; +-const Float *c, *point, *ms; +-const int *jmap, *cmap, *cmlist; +-bool intrinsic_fixed; +-int radial_distortion; +-BEGIN_THREAD_PROC(ComputeJtEC_) +-ComputeJtEC_(q->ncam, q->ee, q->jte, q->c, q->point, q->ms, q->jmap, q->cmap, +- q->cmlist, q->intrinsic_fixed, q->radial_distortion, 0); +-END_THREAD_RPOC(ComputeJtEC_) +- +-template +-void ComputeJtEC_(size_t ncam, const Float* ee, Float* jte, const Float* c, +- const Float* point, const Float* ms, const int* jmap, +- const int* cmap, const int* cmlist, bool intrinsic_fixed, +- int radial_distortion, int mt) { +- if (mt > 1 && ncam >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- // if(ncam < mt) mt = ncam; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = ncam * i / thread_num; +- size_t last_ = ncam * (i + 1) / thread_num; +- size_t last = std::min(last_, ncam); +- RUN_THREAD(ComputeJtEC_, threads[i], (last - first), ee, jte + 8 * first, +- c + first * 16, point, ms, jmap, cmap + first, cmlist, +- intrinsic_fixed, radial_distortion); +- } +- WAIT_THREAD(threads, thread_num); +- +- } else { +- ///////////////////////////////// 
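// In the single-thread path below the camera Jacobian is recomputed on the
// fly: for every measurement of camera i, JacobianOne fills the two 8-wide
// rows jcx/jcy and AddScaledVec8 accumulates e_x * jcx + e_y * jcy into that
// camera's entry of J^T e.  The jcv buffer carries 8 floats of padding so
// ALIGN_PTR can return a 16-byte aligned pointer for the SIMD kernels.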
+- Float jcv[16 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jcx = jcv + ((16 - offset) / sizeof(Float)), * jcy = jcx + 8; +- Float *jcx = (Float *)ALIGN_PTR(jcv), *jcy = jcx + 8; +- +- for (size_t i = 0; i < ncam; ++i, ++cmap, jte += 8, c += 16) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- +- for (int j = idx1; j < idx2; ++j) { +- int index = cmlist[j]; +- const Float* pt = point + jmap[2 * index + 1] * POINT_ALIGN; +- const Float* e = ee + index * 2; +- +- JacobianOne(c, pt, ms + index * 2, jcx, jcy, (Float*)NULL, (Float*)NULL, +- intrinsic_fixed, radial_distortion); +- +- ////////////////////////////// +- AddScaledVec8(e[0], jcx, jte); +- AddScaledVec8(e[1], jcy, jte); +- } +- } +- } +-} +- +-template +-void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float* ee, +- Float* jte, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, const int* cmap, +- const int* cmlist, const int* pmap, const Float* jp, +- bool intrinsic_fixed, int radial_distortion, int mode, +- int mt) { +- if (mode != 2) { +- SetVectorZero(jte, jte + ncam * 8); +- ComputeJtEC_(ncam, ee, jte, camera, point, ms, jmap, cmap, cmlist, +- intrinsic_fixed, radial_distortion, mt); +- } +- if (mode != 1) { +- ComputeJtEP(npt, ee, jp, pmap, jte + 8 * ncam, mt); +- } +-} +- +-template +-void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float* ee, +- Float* jte, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, bool intrinsic_fixed, +- int radial_distortion, int mode) { +- SetVectorZero(jte, jte + (ncam * 8 + npt * POINT_ALIGN)); +- Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jc = jcv + (16 - offset) / sizeof(Float), *pj = jc + 16; +- Float *jc = (Float *)ALIGN_PTR(jcv), *pj = jc + 16; +- +- Float *vc0 = jte, *vp0 = jte + ncam * 8; +- +- for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, ee += 2) { +- int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- +- if (mode == 0) { +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, pj, pj + POINT_ALIGN, intrinsic_fixed, +- radial_distortion); +- +- //////////////////////////////////////////// +- Float *vc = vc0 + cidx * 8, *vp = vp0 + pidx * POINT_ALIGN; +- AddScaledVec8(ee[0], jc, vc); +- AddScaledVec8(ee[1], jc + 8, vc); +- vp[0] += (ee[0] * pj[0] + ee[1] * pj[POINT_ALIGN]); +- vp[1] += (ee[0] * pj[1] + ee[1] * pj[POINT_ALIGN + 1]); +- vp[2] += (ee[0] * pj[2] + ee[1] * pj[POINT_ALIGN + 2]); +- } else if (mode == 1) { +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, (Float*)NULL, (Float*)NULL, +- intrinsic_fixed, radial_distortion); +- +- //////////////////////////////////////////// +- Float* vc = vc0 + cidx * 8; +- AddScaledVec8(ee[0], jc, vc); +- AddScaledVec8(ee[1], jc + 8, vc); +- } else { +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, (Float*)NULL, (Float*)NULL, pj, pj + POINT_ALIGN, +- intrinsic_fixed, radial_distortion); +- +- //////////////////////////////////////////// +- Float* vp = vp0 + pidx * POINT_ALIGN; +- vp[0] += (ee[0] * pj[0] + ee[1] * pj[POINT_ALIGN]); +- vp[1] += (ee[0] * pj[1] + ee[1] * pj[POINT_ALIGN + 1]); +- vp[2] += (ee[0] * pj[2] + ee[1] * pj[POINT_ALIGN + 2]); +- } +- } +-} +-}; +- +-using namespace ProgramCPU; +- +-template +-SparseBundleCPU::SparseBundleCPU(const int num_threads) +- : ParallelBA(PBA_INVALID_DEVICE), +- _num_camera(0), +- _num_point(0), +- 
_num_imgpt(0), +- _num_imgpt_q(0), +- _camera_data(NULL), +- _point_data(NULL), +- _imgpt_data(NULL), +- _camera_idx(NULL), +- _point_idx(NULL), +- _projection_sse(0) { +- __cpu_data_precision = sizeof(Float); +- if (num_threads <= 0) { +- __num_cpu_cores = FindProcessorCoreNum(); +- } else { +- __num_cpu_cores = num_threads; +- } +- if (__verbose_level) +- std::cout << "CPU " << (__cpu_data_precision == 4 ? "single" : "double") +- << "-precision solver; " << __num_cpu_cores << " cores" +-#ifdef CPUPBA_USE_AVX +- << " (AVX)" +-#endif +- << ".\n"; +- // the following configuration are totally based my personal experience +- // on two computers.. you should adjust them according to your system. +- // try run driver filename -profile --float to see how speed varies +- //////////////////////////////////////// +- __num_cpu_thread[FUNC_JX] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JX_] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JTE_] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JCO_JCT_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JCO_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JCT_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_PJ] = __num_cpu_cores; +- __num_cpu_thread[FUNC_BCC_JCO] = __num_cpu_cores; +- __num_cpu_thread[FUNC_BCC_JCT] = __num_cpu_cores; +- __num_cpu_thread[FUNC_BCP] = __num_cpu_cores; +- +- ////this behavious is different between CPU and GPU +- __multiply_jx_usenoj = false; +- +- /////////////////////////////////////////////////////////////////////////////// +- // To get the best performance, you should ajust the number of threads +- // Linux and Windows may also have different thread launching overhead. +- +- ////////////////////////////////////////////////////////////// +- __num_cpu_thread[FUNC_JTEC_JCT] = __num_cpu_cores * 2; +- __num_cpu_thread[FUNC_JTEC_JCO] = __num_cpu_cores * 2; +- __num_cpu_thread[FUNC_JTEP] = __num_cpu_cores; +- +- /////////// +- __num_cpu_thread[FUNC_MPC] = +- 1; // single thread always faster with my experience +- +- // see the AUTO_MT_NUM marcro for definition +- __num_cpu_thread[FUNC_MPP] = 0; // automatically chosen according to size +- __num_cpu_thread[FUNC_VS] = 0; // automatically chosen according to size +- __num_cpu_thread[FUNC_VV] = 0; // automatically chosen accodring to size +-} +- +-template +-void SparseBundleCPU::SetCameraData(size_t ncam, CameraT* cams) { +- if (sizeof(CameraT) != 16 * sizeof(float)) return; // never gonna happen...? 
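// The solver assumes CameraT is laid out as exactly 16 floats (focal length,
// translation, 3x3 rotation, radial distortion, plus two trailing fields used
// for flags such as the constant-camera switch read at c[15]); a mismatching
// CameraT makes SetCameraData silently ignore the call.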
+- _num_camera = (int)ncam; +- _camera_data = cams; +- _focal_mask = NULL; +-} +- +-template +-void SparseBundleCPU::SetFocalMask(const int* fmask, float weight) { +- _focal_mask = fmask; +- _weight_q = weight; +-} +- +-template +-void SparseBundleCPU::SetPointData(size_t npoint, Point3D* pts) { +- _num_point = (int)npoint; +- _point_data = (float*)pts; +-} +- +-template +-void SparseBundleCPU::SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, +- const int* cam_idx) { +- _num_imgpt = (int)nproj; +- _imgpt_data = (float*)imgpts; +- _camera_idx = cam_idx; +- _point_idx = point_idx; +-} +- +-template +-float SparseBundleCPU::GetMeanSquaredError() { +- return float(_projection_sse / +- (_num_imgpt * __focal_scaling * __focal_scaling)); +-} +- +-template +-int SparseBundleCPU::RunBundleAdjustment() { +- ResetBundleStatistics(); +- BundleAdjustment(); +- if (__num_lm_success > 0) +- SaveBundleStatistics(_num_camera, _num_point, _num_imgpt); +- if (__num_lm_success > 0) PrintBundleStatistics(); +- ResetTemporarySetting(); +- return __num_lm_success; +-} +- +-template +-int SparseBundleCPU::ValidateInputData() { +- if (_camera_data == NULL) return STATUS_CAMERA_MISSING; +- if (_point_data == NULL) return STATUS_POINT_MISSING; +- if (_imgpt_data == NULL) return STATUS_MEASURMENT_MISSING; +- if (_camera_idx == NULL || _point_idx == NULL) +- return STATUS_PROJECTION_MISSING; +- return STATUS_SUCCESS; +-} +- +-template +-int SparseBundleCPU::InitializeBundle() { +- ///////////////////////////////////////////////////// +- TimerBA timer(this, TIMER_GPU_ALLOCATION); +- InitializeStorageForSFM(); +- InitializeStorageForCG(); +- +- if (__debug_pba) DumpCooJacobian(); +- +- return STATUS_SUCCESS; +-} +- +-template +-int SparseBundleCPU::GetParameterLength() { +- return _num_camera * 8 + POINT_ALIGN * _num_point; +-} +- +-template +-void SparseBundleCPU::BundleAdjustment() { +- if (ValidateInputData() != STATUS_SUCCESS) return; +- +- //////////////////////// +- TimerBA timer(this, TIMER_OVERALL); +- +- NormalizeData(); +- if (InitializeBundle() != STATUS_SUCCESS) { +- // failed to allocate gpu storage +- } else if (__profile_pba) { +- // profiling some stuff +- RunProfileSteps(); +- } else { +- // real optimization +- AdjustBundleAdjsutmentMode(); +- NonlinearOptimizeLM(); +- TransferDataToHost(); +- } +- DenormalizeData(); +-} +- +-template +-void SparseBundleCPU::NormalizeData() { +- TimerBA timer(this, TIMER_PREPROCESSING); +- NormalizeDataD(); +- NormalizeDataF(); +-} +- +-template +-void SparseBundleCPU::TransferDataToHost() { +- TimerBA timer(this, TIMER_GPU_DOWNLOAD); +- std::copy(_cuCameraData.begin(), _cuCameraData.end(), ((float*)_camera_data)); +-#ifdef POINT_DATA_ALIGN4 +- std::copy(_cuPointData.begin(), _cuPointData.end(), _point_data); +-#else +- for (size_t i = 0, j = 0; i < _cuPointData.size(); j++) { +- _point_data[j++] = (float)_cuPointData[i++]; +- _point_data[j++] = (float)_cuPointData[i++]; +- _point_data[j++] = (float)_cuPointData[i++]; +- } +-#endif +-} +- +-#define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \ +- { \ +- NAME.resize((num) * (channels)); \ +- total_sz += NAME.size() * sizeof(Float); \ +- } +-#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \ +- if (option) ALLOCATE_REQUIRED_DATA(NAME, num, channels) else { \ +- NAME.resize(0); \ +- } +-////////////////////////////////////////////// +-template +-bool SparseBundleCPU::InitializeStorageForSFM() { +- size_t total_sz = 0; +- ////////////////////////////////////////////////// +- 
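// Storage plan for InitializeStorageForSFM: cameras take 16 Floats each and
// points POINT_ALIGN Floats each; _cuCameraMeasurementMap and
// _cuPointMeasurementMap are CSR-style offset arrays into the measurement
// list, and _cuProjectionMap stores the (camera index, point index) pair for
// every projection.  total_sz feeds the memory report printed when
// __verbose_level > 1.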
ProcessIndexCameraQ(_cuCameraQMap, _cuCameraQList); +- total_sz += ((_cuCameraQMap.size() + _cuCameraQList.size()) * sizeof(int) / +- 1024 / 1024); +- +- /////////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuPointData, _num_point, POINT_ALIGN); // 4n +- ALLOCATE_REQUIRED_DATA(_cuCameraData, _num_camera, 16); // 16m +- ALLOCATE_REQUIRED_DATA(_cuCameraDataEX, _num_camera, 16); // 16m +- +- //////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementMap, _num_camera + 1, 1); // m +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementList, _num_imgpt, 1); // k +- ALLOCATE_REQUIRED_DATA(_cuPointMeasurementMap, _num_point + 1, 1); // n +- ALLOCATE_REQUIRED_DATA(_cuProjectionMap, _num_imgpt, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuImageProj, _num_imgpt + _num_imgpt_q, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuPointDataEX, _num_point, POINT_ALIGN); // 4n +- ALLOCATE_REQUIRED_DATA(_cuMeasurements, _num_imgpt, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuCameraQMapW, _num_imgpt_q, 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQListW, (_num_imgpt_q > 0 ? _num_camera : 0), +- 2); +- +- ALLOCATE_OPTIONAL_DATA(_cuJacobianPoint, _num_imgpt * 2, POINT_ALIGN, +- !__no_jacobian_store); // 8k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCameraT, _num_imgpt * 2, 8, +- !__no_jacobian_store && __jc_store_transpose); // 16k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCamera, _num_imgpt * 2, 8, +- !__no_jacobian_store && __jc_store_original); // 16k +- ALLOCATE_OPTIONAL_DATA(_cuCameraMeasurementListT, _num_imgpt, 1, +- __jc_store_transpose); // k +- +- ////////////////////////////////////////// +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- ////mapping from camera to measuremnts +- vector& cpi = _cuCameraMeasurementMap; +- cpi.resize(_num_camera + 1); +- vector& cpidx = _cuCameraMeasurementList; +- cpidx.resize(_num_imgpt); +- vector cpnum(_num_camera, 0); +- cpi[0] = 0; +- for (int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++; +- for (int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1]; +- /////////////////////////////////////////////////////// +- vector cptidx = cpi; +- for (int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i; +- +- /////////////////////////////////////////////////////////// +- if (_cuCameraMeasurementListT.size()) { +- vector& ridx = _cuCameraMeasurementListT; +- ridx.resize(_num_imgpt); +- for (int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i; +- } +- +- //////////////////////////////////////// +- /////constaraint weights. 
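// Focal-length constraints: _focal_mask groups cameras that must share a
// focal length (and distortion).  ProcessIndexCameraQ links each group into a
// doubly linked list and emits one (camera, next-camera) pair per link into
// _cuCameraQMap; ProcessWeightCameraQ below then weights each pair by
// sqrt(group's average projections per camera) * _weight_q, adding a
// radial-distortion weight when __use_radial_distortion is set.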
+- if (_num_imgpt_q > 0) +- ProcessWeightCameraQ(cpnum, _cuCameraQMap, _cuCameraQMapW.begin(), +- _cuCameraQListW.begin()); +- +- /////////////////////////////////////////////////////////////////////////////// +- std::copy((float*)_camera_data, ((float*)_camera_data) + _cuCameraData.size(), +- _cuCameraData.begin()); +- +-#ifdef POINT_DATA_ALIGN4 +- std::copy(_point_data, _point_data + _cuPointData.size(), +- _cuPointData.begin()); +-#else +- for (size_t i = 0, j = 0; i < _cuPointData.size(); j++) { +- _cuPointData[i++] = _point_data[j++]; +- _cuPointData[i++] = _point_data[j++]; +- _cuPointData[i++] = _point_data[j++]; +- } +-#endif +- +- //////////////////////////////////////////// +- ///////mapping from point to measurment +- vector& ppi = _cuPointMeasurementMap; +- ppi.resize(_num_point + 1); +- for (int i = 0, last_point = -1; i < _num_imgpt; ++i) { +- int pt = _point_idx[i]; +- while (last_point < pt) ppi[++last_point] = i; +- } +- ppi[_num_point] = _num_imgpt; +- +- //////////projection map +- vector& pmp = _cuProjectionMap; +- pmp.resize(_num_imgpt * 2); +- for (int i = 0; i < _num_imgpt; ++i) { +- int* imp = &pmp[i * 2]; +- imp[0] = _camera_idx[i]; +- imp[1] = _point_idx[i]; +- } +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- ////////////////////////////////////////////////////////////// +- +- __memory_usage = total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Motion/Structure/Jacobian:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- +- return true; +-} +- +-template +-bool SparseBundleCPU::ProcessIndexCameraQ(vector& qmap, +- vector& qlist) { +- /////////////////////////////////// +- qlist.resize(0); +- qmap.resize(0); +- _num_imgpt_q = 0; +- +- if (_camera_idx == NULL) return true; +- if (_point_idx == NULL) return true; +- if (_focal_mask == NULL) return true; +- if (_num_camera == 0) return true; +- if (_weight_q <= 0) return true; +- +- /////////////////////////////////////// +- +- int error = 0; +- vector temp(_num_camera * 2, -1); +- +- for (int i = 0; i < _num_camera; ++i) { +- int iq = _focal_mask[i]; +- if (iq > i) { +- error = 1; +- break; +- } +- if (iq < 0) continue; +- if (iq == i) continue; +- int ip = temp[2 * iq]; +- // float ratio = _camera_data[i].f / _camera_data[iq].f; +- // if(ratio < 0.01 || ratio > 100) +- //{ +- // std::cout << "Warning: constaraints on largely different camreas\n"; +- // continue; +- //}else +- if (_focal_mask[iq] != iq) { +- error = 1; +- break; +- } else if (ip == -1) { +- temp[2 * iq] = i; +- temp[2 * iq + 1] = i; +- temp[2 * i] = iq; +- temp[2 * i + 1] = iq; +- } else { +- // maintain double-linked list +- temp[2 * i] = ip; +- temp[2 * i + 1] = iq; +- temp[2 * ip + 1] = i; +- temp[2 * iq] = i; +- } +- } +- +- if (error) { +- std::cout << "Error: incorrect constraints\n"; +- _focal_mask = NULL; +- return false; +- } +- +- //////////////////////////////////////// +- qlist.resize(_num_camera * 2, -1); +- for (int i = 0; i < _num_camera; ++i) { +- int inext = temp[2 * i + 1]; +- if (inext == -1) continue; +- qlist[2 * i] = _num_imgpt_q; +- qlist[2 * inext + 1] = _num_imgpt_q; +- qmap.push_back(i); +- qmap.push_back(inext); +- _num_imgpt_q++; +- } +- return true; +-} +- +-template +-void SparseBundleCPU::ProcessWeightCameraQ(vector& cpnum, +- vector& qmap, +- Float* qmapw, Float* qlistw) { +- // set average focal length and average radial distortion +- vector qpnum(_num_camera, 0), qcnum(_num_camera, 0); +- vector fs(_num_camera, 0), rs(_num_camera, 0); +- +- for (int i = 0; i < _num_camera; ++i) { +- int 
qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- fs[qi] += _camera_data[i].f; +- rs[qi] += _camera_data[i].radial; +- qpnum[qi] += cpnum[i]; +- qcnum[qi] += 1.0f; +- } +- +- // this seems not really matter..they will converge anyway +- for (int i = 0; i < _num_camera; ++i) { +- int qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- _camera_data[i].f = fs[qi] / qcnum[qi]; +- _camera_data[i].radial = rs[qi] / qcnum[qi]; +- } /**/ +- +- ///////////////////////////////////////// +- std::fill(qlistw, qlistw + _num_camera * 2, 0); +- +- for (int i = 0; i < _num_imgpt_q; ++i) { +- int cidx = qmap[i * 2], qi = _focal_mask[cidx]; +- Float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q; +- Float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0); +- qmapw[i * 2] = wi; +- qmapw[i * 2 + 1] = wr; +- qlistw[cidx * 2] = wi; +- qlistw[cidx * 2 + 1] = wr; +- } +-} +- +-///////////////////////////////////////////////// +-template +-bool SparseBundleCPU::InitializeStorageForCG() { +- size_t total_sz = 0; +- int plen = GetParameterLength(); // q = 8m + 3n +- +- //////////////////////////////////////////// 6q +- ALLOCATE_REQUIRED_DATA(_cuVectorJtE, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorXK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorJJ, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorZK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorPK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorRK, plen, 1); +- +- /////////////////////////////////////////// +- unsigned int cblock_len = (__use_radial_distortion ? 64 : 56); +- ALLOCATE_REQUIRED_DATA(_cuBlockPC, _num_camera * cblock_len + 6 * _num_point, +- 1); // 64m + 12n +- ALLOCATE_REQUIRED_DATA(_cuVectorJX, _num_imgpt + _num_imgpt_q, 2); // 2k +- ALLOCATE_OPTIONAL_DATA(_cuVectorSJ, plen, 1, __jacobian_normalize); +- +- ///////////////////////////////////////// +- __memory_usage += total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Conjugate Gradient Solver:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- return true; +-} +- +-/////////////////////////////////////////////////// +-template +-void SparseBundleCPU::PrepareJacobianNormalization() { +- if (!_cuVectorSJ.size()) return; +- +- if ((__jc_store_transpose || __jc_store_original) && +- _cuJacobianPoint.size() && !__bundle_current_mode) { +- VectorF null; +- null.swap(_cuVectorSJ); +- EvaluateJacobians(); +- null.swap(_cuVectorSJ); +- ComputeDiagonal(_cuVectorSJ); +- ComputeSQRT(_cuVectorSJ); +- } else { +- VectorF null; +- null.swap(_cuVectorSJ); +- EvaluateJacobians(); +- ComputeBlockPC(0, true); +- null.swap(_cuVectorSJ); +- _cuVectorJJ.swap(_cuVectorSJ); +- ComputeRSQRT(_cuVectorSJ); +- } +-} +- +-template +-void SparseBundleCPU::EvaluateJacobians() { +- if (__no_jacobian_store) return; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original && +- !__jc_store_transpose) +- return; +- +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JJ, true); +- +- if (__jc_store_original || !__jc_store_transpose) { +- int fid = __jc_store_original +- ? (__jc_store_transpose ? 
FUNC_JJ_JCO_JCT_JP : FUNC_JJ_JCO_JP) +- : FUNC_JJ_JP; +- ComputeJacobian( +- _num_imgpt, _num_camera, _cuCameraData.begin(), _cuPointData.begin(), +- _cuJacobianCamera.begin(), _cuJacobianPoint.begin(), +- &_cuProjectionMap.front(), _cuVectorSJ.begin(), _cuMeasurements.begin(), +- __jc_store_transpose ? &_cuCameraMeasurementListT.front() : NULL, +- __fixed_intrinsics, __use_radial_distortion, false, +- _cuJacobianCameraT.begin(), __num_cpu_thread[fid]); +- } else { +- ComputeJacobian(_num_imgpt, _num_camera, _cuCameraData.begin(), +- _cuPointData.begin(), _cuJacobianCameraT.begin(), +- _cuJacobianPoint.begin(), &_cuProjectionMap.front(), +- _cuVectorSJ.begin(), _cuMeasurements.begin(), +- &_cuCameraMeasurementListT.front(), __fixed_intrinsics, +- __use_radial_distortion, true, ((Float*)0), +- __num_cpu_thread[FUNC_JJ_JCT_JP]); +- } +- ++__num_jacobian_eval; +-} +- +-template +-void SparseBundleCPU::ComputeJtE(VectorF& E, VectorF& JtE, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JTE, true); +- if (mode == 0) mode = __bundle_current_mode; +- +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) { +- if (_cuJacobianPoint.size()) { +- ProgramCPU::ComputeJtE_( +- _num_imgpt, _num_camera, _num_point, E.begin(), JtE.begin(), +- _cuCameraData.begin(), _cuPointData.begin(), _cuMeasurements.begin(), +- &_cuProjectionMap.front(), &_cuCameraMeasurementMap.front(), +- &_cuCameraMeasurementList.front(), &_cuPointMeasurementMap.front(), +- _cuJacobianPoint.begin(), __fixed_intrinsics, __use_radial_distortion, +- mode, __num_cpu_thread[FUNC_JTE_]); +- +- if (_cuVectorSJ.size() && mode != 2) +- ProgramCPU::ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8); +- } else { +- ProgramCPU::ComputeJtE_(_num_imgpt, _num_camera, _num_point, E.begin(), +- JtE.begin(), _cuCameraData.begin(), +- _cuPointData.begin(), _cuMeasurements.begin(), +- &_cuProjectionMap.front(), __fixed_intrinsics, +- __use_radial_distortion, mode); +- +- ////////////////////////////////////////////////////////// +- // if(_cuVectorSJ.size()) ProgramCPU::ComputeVXY(JtE, _cuVectorSJ, JtE); +- if (!_cuVectorSJ.size()) { +- } else if (mode == 2) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_point * POINT_ALIGN, +- _num_camera * 8); +- else if (mode == 1) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8); +- else +- ComputeVXY(JtE, _cuVectorSJ, JtE); +- } +- } else if (__jc_store_transpose) { +- ProgramCPU::ComputeJtE( +- _num_camera, _num_point, E.begin(), _cuJacobianCameraT.begin(), +- &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.begin(), +- true, mode, __num_cpu_thread[FUNC_JTEC_JCT], +- __num_cpu_thread[FUNC_JTEP]); +- } else { +- ProgramCPU::ComputeJtE( +- _num_camera, _num_point, E.begin(), _cuJacobianCamera.begin(), +- &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.begin(), +- false, mode, __num_cpu_thread[FUNC_JTEC_JCO], +- __num_cpu_thread[FUNC_JTEP]); +- } +- +- if (mode != 2 && _num_imgpt_q > 0) { +- ProgramCPU::ComputeJQtEC(_num_camera, E.begin() + 2 * _num_imgpt, +- &_cuCameraQList.front(), _cuCameraQListW.begin(), +- _cuVectorSJ.begin(), JtE.begin()); +- } +-} +- +-template +-void SparseBundleCPU::SaveBundleRecord(int iter, float res, +- float damping, float& g_norm, +- float& g_inf) { +- // do not really compute if parameter not specified... +- // for large dataset, it never converges.. 
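// A reading of the two statistics computed below, going by how the helpers
// are used elsewhere in this file: g_inf is the largest-magnitude entry of
// J^T e (an infinity-norm style value, computed only when __lm_check_gradient
// is set so it can be tested against __lm_gradient_threshold), and g_norm is
// the squared Euclidean norm of the same gradient vector, computed only when
// __save_gradient_norm asks for it; otherwise g_norm simply reuses g_inf.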
+- g_inf = __lm_check_gradient ? float(ComputeVectorMax(_cuVectorJtE)) : 0; +- g_norm = +- __save_gradient_norm ? float(ComputeVectorNorm(_cuVectorJtE)) : g_inf; +- ConfigBA::SaveBundleRecord(iter, res, damping, g_norm, g_inf); +-} +- +-template +-float SparseBundleCPU::EvaluateProjection(VectorF& cam, VectorF& point, +- VectorF& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjection(_num_imgpt, cam.begin(), point.begin(), +- _cuMeasurements.begin(), &_cuProjectionMap.front(), +- proj.begin(), __use_radial_distortion, +- __num_cpu_thread[FUNC_PJ]); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(_num_imgpt_q, cam.begin(), &_cuCameraQMap.front(), +- _cuCameraQMapW.begin(), proj.begin() + 2 * _num_imgpt); +- return (float)ComputeVectorNorm(proj, __num_cpu_thread[FUNC_VS]); +-} +- +-template +-float SparseBundleCPU::EvaluateProjectionX(VectorF& cam, VectorF& point, +- VectorF& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjectionX(_num_imgpt, cam.begin(), point.begin(), +- _cuMeasurements.begin(), &_cuProjectionMap.front(), +- proj.begin(), __use_radial_distortion, +- __num_cpu_thread[FUNC_PJ]); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(_num_imgpt_q, cam.begin(), &_cuCameraQMap.front(), +- _cuCameraQMapW.begin(), proj.begin() + 2 * _num_imgpt); +- return (float)ComputeVectorNorm(proj, __num_cpu_thread[FUNC_VS]); +-} +- +-template +-void SparseBundleCPU::ComputeJX(VectorF& X, VectorF& JX, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JX, true); +- if (__no_jacobian_store || (__multiply_jx_usenoj && mode != 2) || +- !__jc_store_original) { +- ProgramCPU::ComputeJX_( +- _num_imgpt, _num_camera, X.begin(), JX.begin(), _cuCameraData.begin(), +- _cuPointData.begin(), _cuMeasurements.begin(), _cuVectorSJ.begin(), +- &_cuProjectionMap.front(), __fixed_intrinsics, __use_radial_distortion, +- mode, __num_cpu_thread[FUNC_JX_]); +- } else { +- ProgramCPU::ComputeJX(_num_imgpt, _num_camera, X.begin(), +- _cuJacobianCamera.begin(), _cuJacobianPoint.begin(), +- &_cuProjectionMap.front(), JX.begin(), mode, +- __num_cpu_thread[FUNC_JX]); +- } +- +- if (_num_imgpt_q > 0 && mode != 2) { +- ProgramCPU::ComputeJQX(_num_imgpt_q, X.begin(), &_cuCameraQMap.front(), +- _cuCameraQMapW.begin(), _cuVectorSJ.begin(), +- JX.begin() + 2 * _num_imgpt); +- } +-} +- +-template +-void SparseBundleCPU::ComputeBlockPC(float lambda, bool dampd) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_BC, true); +- +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose && +- __bundle_current_mode != 2)) { +- ComputeDiagonalBlock_( +- lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuProjectionMap, _cuVectorSJ, _cuCameraQListW, _cuVectorJJ, _cuBlockPC, +- __fixed_intrinsics, __use_radial_distortion, __bundle_current_mode); +- } else if (__jc_store_transpose) { +- ComputeDiagonalBlock( +- _num_camera, _num_point, lambda, dampd, _cuJacobianCameraT.begin(), +- &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(), +- &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(), +- _cuBlockPC.begin(), __use_radial_distortion, true, +- __num_cpu_thread[FUNC_BCC_JCT], __num_cpu_thread[FUNC_BCP], +- __bundle_current_mode); +- } else { +- ComputeDiagonalBlock( +- _num_camera, _num_point, lambda, dampd, _cuJacobianCamera.begin(), +- &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(), 
+- &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(), +- _cuBlockPC.begin(), __use_radial_distortion, false, +- __num_cpu_thread[FUNC_BCC_JCO], __num_cpu_thread[FUNC_BCP], +- __bundle_current_mode); +- } +-} +- +-template +-void SparseBundleCPU::ApplyBlockPC(VectorF& v, VectorF& pv, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_MP, true); +- MultiplyBlockConditioner(_num_camera, _num_point, _cuBlockPC.begin(), +- v.begin(), pv.begin(), __use_radial_distortion, mode, +- __num_cpu_thread[FUNC_MPC], +- __num_cpu_thread[FUNC_MPP]); +-} +- +-template +-void SparseBundleCPU::ComputeDiagonal(VectorF& JJ) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_DD, true); +- if (__no_jacobian_store) { +- } else if (__jc_store_transpose) { +- ProgramCPU::ComputeDiagonal( +- _cuJacobianCameraT, _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuCameraQListW.begin(), JJ, true, __use_radial_distortion); +- } else if (__jc_store_original) { +- ProgramCPU::ComputeDiagonal( +- _cuJacobianCamera, _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuCameraQListW.begin(), JJ, false, __use_radial_distortion); +- } +-} +- +-template +-void SparseBundleCPU::NormalizeDataF() { +- int incompatible_radial_distortion = 0; +- _cuMeasurements.resize(_num_imgpt * 2); +- if (__focal_normalize) { +- if (__focal_scaling == 1.0f) { +- //------------------------------------------------------------------ +- ////////////////////////////////////////////////////////////// +- vector focals(_num_camera); +- for (int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f; +- std::nth_element(focals.begin(), focals.begin() + _num_camera / 2, +- focals.end()); +- float median_focal_length = focals[_num_camera / 2]; +- __focal_scaling = __data_normalize_median / median_focal_length; +- Float radial_factor = median_focal_length * median_focal_length * 4.0f; +- +- /////////////////////////////// +- +- for (int i = 0; i < _num_imgpt * 2; ++i) { +- _cuMeasurements[i] = Float(_imgpt_data[i] * __focal_scaling); +- } +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f *= __focal_scaling; +- if (!__use_radial_distortion) { +- } else if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- incompatible_radial_distortion++; +- _camera_data[i].radial = 0; +- } else if (__use_radial_distortion == -1) { +- _camera_data[i].radial *= radial_factor; +- } +- } +- if (__verbose_level > 2) +- std::cout << "Focal length normalized by " << __focal_scaling << '\n'; +- __reset_initial_distortion = false; +- } +- } else { +- if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) { +- if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- _camera_data[i].radial = 0; +- incompatible_radial_distortion++; +- } +- } +- __reset_initial_distortion = false; +- } +- std::copy(_imgpt_data, _imgpt_data + _cuMeasurements.size(), +- _cuMeasurements.begin()); +- } +- +- if (incompatible_radial_distortion) { +- std::cout << "ERROR: incompatible radial distortion input; reset to 0;\n"; +- } +-} +- +-template +-void SparseBundleCPU::NormalizeDataD() { +- if (__depth_scaling == 1.0f) { +- const float dist_bound = 1.0f; +- vector oz(_num_imgpt); +- vector cpdist1(_num_camera, 
dist_bound); +- vector cpdist2(_num_camera, -dist_bound); +- vector camnpj(_num_camera, 0), cambpj(_num_camera, 0); +- int bad_point_count = 0; +- for (int i = 0; i < _num_imgpt; ++i) { +- int cmidx = _camera_idx[i]; +- CameraT* cam = _camera_data + cmidx; +- float* rz = cam->m[2]; +- float* x = _point_data + 4 * _point_idx[i]; +- oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->t[2]); +- +- ///////////////////////////////////////////////// +- // points behind camera may causes big problem +- float ozr = oz[i] / cam->t[2]; +- if (fabs(ozr) < __depth_check_epsilon) { +- bad_point_count++; +- float px = cam->f * (cam->m[0][0] * x[0] + cam->m[0][1] * x[1] + +- cam->m[0][2] * x[2] + cam->t[0]); +- float py = cam->f * (cam->m[1][0] * x[0] + cam->m[1][1] * x[1] + +- cam->m[1][2] * x[2] + cam->t[1]); +- float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1]; +- bool checkx = fabs(mx) > fabs(my); +- if ((checkx && px * oz[i] * mx < 0 && fabs(mx) > 64) || +- (!checkx && py * oz[i] * my < 0 && fabs(my) > 64)) { +- if (__verbose_level > 3) +- std::cout << "Warning: proj of #" << cmidx +- << " on the wrong side, oz = " << oz[i] << " (" +- << (px / oz[i]) << ',' << (py / oz[i]) << ") (" << mx +- << ',' << my << ")\n"; +- ///////////////////////////////////////////////////////////////////////// +- if (oz[i] > 0) +- cpdist2[cmidx] = 0; +- else +- cpdist1[cmidx] = 0; +- } +- if (oz[i] >= 0) +- cpdist1[cmidx] = std::min(cpdist1[cmidx], oz[i]); +- else +- cpdist2[cmidx] = std::max(cpdist2[cmidx], oz[i]); +- } +- if (oz[i] < 0) { +- __num_point_behind++; +- cambpj[cmidx]++; +- } +- camnpj[cmidx]++; +- } +- if (bad_point_count > 0 && __depth_degeneracy_fix) { +- if (!__focal_normalize || !__depth_normalize) +- std::cout << "Enable data normalization on degeneracy\n"; +- __focal_normalize = true; +- __depth_normalize = true; +- } +- if (__depth_normalize) { +- std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end()); +- float oz_median = oz[_num_imgpt / 2]; +- float shift_min = std::min(oz_median * 0.001f, 1.0f); +- float dist_threshold = shift_min * 0.1f; +- __depth_scaling = (1.0 / oz_median) / __data_normalize_median; +- if (__verbose_level > 2) +- std::cout << "Depth normalized by " << __depth_scaling << " (" +- << oz_median << ")\n"; +- +- for (int i = 0; i < _num_camera; ++i) { +- // move the camera a little bit? +- if (!__depth_degeneracy_fix) { +- } else if ((cpdist1[i] < dist_threshold || +- cpdist2[i] > -dist_threshold)) { +- float shift_epsilon = fabs(_camera_data[i].t[2] * FLT_EPSILON); +- float shift = std::max(shift_min, shift_epsilon); +- bool boths = +- cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold; +- _camera_data[i].t[2] += shift; +- if (__verbose_level > 3) +- std::cout << "Adjust C" << std::setw(5) << i << " by " +- << std::setw(12) << shift << " [B" << std::setw(2) +- << cambpj[i] << "/" << std::setw(5) << camnpj[i] << "] [" +- << (boths ? 
'X' : ' ') << "][" << cpdist1[i] << ", " +- << cpdist2[i] << "]\n"; +- __num_camera_modified++; +- } +- _camera_data[i].t[0] *= __depth_scaling; +- _camera_data[i].t[1] *= __depth_scaling; +- _camera_data[i].t[2] *= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- ///////////////////////////////// +- _point_data[4 * i + 0] *= __depth_scaling; +- _point_data[4 * i + 1] *= __depth_scaling; +- _point_data[4 * i + 2] *= __depth_scaling; +- } +- } +- if (__num_point_behind > 0) +- std::cout << "WARNING: " << __num_point_behind +- << " points are behind cameras.\n"; +- if (__num_camera_modified > 0) +- std::cout << "WARNING: " << __num_camera_modified +- << " camera moved to avoid degeneracy.\n"; +- } +-} +- +-template +-void SparseBundleCPU::DenormalizeData() { +- if (__focal_normalize && __focal_scaling != 1.0f) { +- float squared_focal_factor = (__focal_scaling * __focal_scaling); +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f /= __focal_scaling; +- if (__use_radial_distortion == -1) +- _camera_data[i].radial *= squared_focal_factor; +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- _projection_sse /= squared_focal_factor; +- __focal_scaling = 1.0f; +- } else if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- +- if (__depth_normalize && __depth_scaling != 1.0f) { +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].t[0] /= __depth_scaling; +- _camera_data[i].t[1] /= __depth_scaling; +- _camera_data[i].t[2] /= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- _point_data[4 * i + 0] /= __depth_scaling; +- _point_data[4 * i + 1] /= __depth_scaling; +- _point_data[4 * i + 2] /= __depth_scaling; +- } +- __depth_scaling = 1.0f; +- } +-} +- +-template +-int SparseBundleCPU::SolveNormalEquationPCGX(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- VectorF null; +- VectorF& VectorDP = __lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- //////////////////////////////////////////////// +- +- /////////////////////////////////////////////////////// +- // B = [BC 0 ; 0 BP] +- // m = [mc 0; 0 mp]; +- // A x= BC * x - JcT * Jp * mp * JpT * Jc * x +- // = JcT * Jc x + lambda * D * x + ........ 
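// Put differently, in common bundle-adjustment notation (the symbols here are
// explanatory and do not appear in this file): with J = [Jc Jp], the damped
// normal equations
//     (J^T J + lambda * D) [dx_c; dx_p] = J^T e
// have the 2x2 block structure
//     [ U   W ] [dx_c]   [g_c]      U = Jc^T Jc + lambda*Dc   (the BC block)
//     [ W^T V ] [dx_p] = [g_p]      V = Jp^T Jp + lambda*Dp   (the BP block)
// and this function runs CG on the reduced camera system
//     (U - W V^{-1} W^T) dx_c = g_c - W V^{-1} g_p
// using the point part of the block preconditioner as V^{-1} (the "mp"
// above); dx_p is then recovered by back-substitution at the end of the
// function.  W and W^T are never formed explicitly: they are applied through
// ComputeJX / ComputeJtE restricted to the camera or point part via the mode
// argument.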
+- //////////////////////////////////////////////////////////// +- +- VectorF r; +- r.set(_cuVectorRK.data(), 8 * _num_camera); +- VectorF p; +- p.set(_cuVectorPK.data(), 8 * _num_camera); +- VectorF z; +- z.set(_cuVectorZK.data(), 8 * _num_camera); +- VectorF x; +- x.set(_cuVectorXK.data(), 8 * _num_camera); +- VectorF d; +- d.set(VectorDP.data(), 8 * _num_camera); +- +- VectorF& u = _cuVectorRK; +- VectorF& v = _cuVectorPK; +- VectorF up; +- up.set(u.data() + 8 * _num_camera, 3 * _num_point); +- VectorF vp; +- vp.set(v.data() + 8 * _num_camera, 3 * _num_point); +- VectorF uc; +- uc.set(z.data(), 8 * _num_camera); +- +- VectorF& e = _cuVectorJX; +- VectorF& e2 = _cuImageProj; +- +- ApplyBlockPC(_cuVectorJtE, u, 2); +- ComputeJX(u, e, 2); +- ComputeJtE(e, uc, 1); +- ComputeSAXPY(Float(-1.0f), uc, _cuVectorJtE, r); // r +- ApplyBlockPC(r, p, 1); // z = p = M r +- +- float_t rtz0 = (float_t)ComputeVectorDot(r, p); // r(0)' * z(0) +- ComputeJX(p, e, 1); // Jc * x +- ComputeJtE(e, u, 2); // JpT * jc * x +- ApplyBlockPC(u, v, 2); +- float_t qtq0 = +- (float_t)ComputeVectorNorm(e, __num_cpu_thread[FUNC_VS]); // q(0)' * q(0) +- float_t pdp0 = (float_t)ComputeVectorNormW(p, d); // p(0)' * DDD * p(0) +- float_t uv0 = (float_t)ComputeVectorDot(up, vp); +- float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!std::isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- ComputeSAX((Float)alpha0, p, x); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY(Float(-1.0f), e2, e, e, __num_cpu_thread[FUNC_VV]); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ((Float)lambda, d, p, uc, uc); +- ComputeSAXPY((Float)-alpha0, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- +- ////////////////////////////////////////////////////////////////////////// +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(r, z, 1); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- r, z); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? 
'0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- //////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((Float)betak, p, z, p); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(p, e, 1); // Jc * p +- ComputeJtE(e, u, 2); // JpT * jc * p +- ApplyBlockPC(u, v, 2); +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = +- (float_t)ComputeVectorNorm(e, __num_cpu_thread[FUNC_VS]); // q(k)' q(k) +- float_t pdpk = (float_t)ComputeVectorNormW(p, d); // p(k)' * DDD * p(k) +- float_t uvk = (float_t)ComputeVectorDot(up, vp); +- float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. +- +- //////////////////////////////////////////////// +- ComputeSAXPY((Float)alphak, p, x, x); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY((Float)-1.0f, e2, e, e, __num_cpu_thread[FUNC_VV]); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ((Float)lambda, d, p, uc, uc); +- ComputeSAXPY((Float)-alphak, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- } +- +- ComputeJX(x, e, 1); +- ComputeJtE(e, u, 2); +- VectorF jte_p; +- jte_p.set(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 3); +- ComputeSAXPY((Float)-1.0f, up, jte_p, vp); +- ApplyBlockPC(v, _cuVectorXK, 2); +- return iteration; +-} +- +-template +-int SparseBundleCPU::SolveNormalEquationPCGB(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- VectorF null; +- VectorF& VectorDP = __lm_use_diagonal_damp ? 
_cuVectorJJ : null; // diagonal +- VectorF& VectorQK = _cuVectorZK; // temporary +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- //////////////////////////////////////////////////////// +- ApplyBlockPC(_cuVectorJtE, +- _cuVectorPK); // z(0) = p(0) = M * r(0)//r(0) = Jt * e +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(0) = J * p(0) +- +- ////////////////////////////////////////////////// +- float_t rtz0 = +- (float_t)ComputeVectorDot(_cuVectorJtE, _cuVectorPK); // r(0)' * z(0) +- float_t qtq0 = (float_t)ComputeVectorNorm( +- _cuVectorJX, __num_cpu_thread[FUNC_VS]); // q(0)' * q(0) +- float_t ptdp0 = +- (float_t)ComputeVectorNormW(_cuVectorPK, VectorDP); // p(0)' * DDD * p(0) +- float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!std::isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- +- ComputeSAX((Float)alpha0, _cuVectorPK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJtE(_cuVectorJX, VectorQK); // Jt * (J * p0) +- +- ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK, +- VectorQK); // Jt * J * p0 + lambda * DDD * p0 +- +- ComputeSAXPY( +- (Float)-alpha0, VectorQK, _cuVectorJtE, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(_cuVectorRK, _cuVectorZK); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- _cuVectorRK, _cuVectorZK); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? '0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- ////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((Float)betak, _cuVectorPK, _cuVectorZK, +- _cuVectorPK); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(k) = J * p(k) +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = (float_t)ComputeVectorNorm( +- _cuVectorJX, __num_cpu_thread[FUNC_VS]); // q(k)' q(k) +- float_t ptdpk = (float_t)ComputeVectorNormW( +- _cuVectorPK, VectorDP); // p(k)' * DDD * p(k) +- +- float_t alphak = rtzk / (qtqk + lambda * ptdpk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. 
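// For reference, this CG loop follows the textbook preconditioned-CG
// recurrence for A x = b, with A = J^T J + lambda * D applied implicitly
// (ComputeJX, then ComputeJtE, plus the ComputeSXYPZ damping term) and
// M^{-1} being the block preconditioner applied by ApplyBlockPC:
//     z_k     = M^{-1} r_k
//     beta_k  = (r_k' z_k) / (r_{k-1}' z_{k-1})
//     p_k     = z_k + beta_k * p_{k-1}
//     alpha_k = (r_k' z_k) / (p_k' A p_k)
//     x_{k+1} = x_k + alpha_k * p_k
//     r_{k+1} = r_k - alpha_k * A p_k
// where p_k' A p_k is evaluated as ||J p_k||^2 + lambda * p_k' D p_k, i.e.
// the qtqk and ptdpk terms above.  Every __cg_recalculate_freq iterations
// (see below) the residual is rebuilt from scratch as r = J^T e - A x rather
// than updated recursively, presumably to limit accumulated round-off.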
+- +- //////////////////////////////////////////////// +- ComputeSAXPY((Float)alphak, _cuVectorPK, _cuVectorXK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) { +- ////r = JtE - (Jt J + lambda * D) x +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK); +- ComputeSAXPY((Float)-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK); +- } else { +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK, +- VectorQK); // +- ComputeSAXPY( +- (Float)-alphak, VectorQK, _cuVectorRK, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- } +- } +- return iteration; +-} +- +-template +-int SparseBundleCPU::SolveNormalEquation(float lambda) { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1); +- return 1; +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2); +- return 1; +- } else { +- ////solve linear system using Conjugate Gradients +- return __cg_schur_complement ? SolveNormalEquationPCGX(lambda) +- : SolveNormalEquationPCGB(lambda); +- } +-} +- +-template +-void SparseBundleCPU::DumpCooJacobian() { +- ////////// +- ofstream jo("j.txt"); +- int cn = __use_radial_distortion ? 8 : 7; +- int width = cn * _num_camera + 3 * _num_point; +- jo << "%%MatrixMarket matrix coordinate real general\n"; +- jo << (_num_imgpt * 2) << " " << width << " " << (cn + 3) * _num_imgpt * 2 +- << '\n'; +- for (int i = 0; i < _num_imgpt; ++i) { +- int ci = _camera_idx[i]; +- int pi = _point_idx[i]; +- int row = i * 2 + 1; +- // Float * jc = _cuJacobianCamera.data() + i * 16; +- // Float * jp = _cuJacobianPoint.data() + i * 6; +- int idx1 = ci * cn; +- int idx2 = _num_camera * cn + 3 * pi; +- +- for (int k = 0; k < 2; ++k, ++row) { +- for (int j = 0; j < cn; ++j) { +- jo << row << " " << (idx1 + j + 1) << " 1\n"; +- } +- for (int j = 0; j < 3; ++j) { +- jo << row << " " << (idx2 + j + 1) << " 1\n"; +- } +- } +- } +- +- ofstream jt("jt.txt"); +- jt << "%%MatrixMarket matrix coordinate real general\n"; +- jt << width << " " << (_num_imgpt * 2) << " " << (cn + 3) * _num_imgpt * 2 +- << '\n'; +- +- int* lisc = &_cuCameraMeasurementList[0]; +- int* mapc = &_cuCameraMeasurementMap[0]; +- int* mapp = &_cuPointMeasurementMap[0]; +- +- for (int i = 0; i < _num_camera; ++i) { +- int c0 = mapc[i]; +- int c1 = mapc[i + 1]; +- for (int k = 0; k < cn; ++k) { +- int row = i * cn + k + 1; +- for (int j = c0; j < c1; ++j) +- jt << row << " " << (lisc[j] * 2 + 1) << " 1\n" << row << " " +- << (2 * lisc[j] + 2) << " 1\n"; +- ; +- } +- } +- for (int i = 0; i < _num_point; ++i) { +- int p0 = mapp[i]; +- int p1 = mapp[i + 1]; +- for (int k = 0; k < 3; ++k) { +- int row = i * 3 + _num_camera * cn + k + 1; +- for (int j = p0; j < p1; ++j) +- jt << row << " " << (2 * j + 1) << " 1\n" << row << " " << (2 * j + 2) +- << " 1\n"; +- ; +- } +- } +-} +- +-template +-void SparseBundleCPU::RunTestIterationLM(bool reduced) { +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- if (reduced) 
+- SolveNormalEquationPCGX(__lm_initial_damp); +- else +- SolveNormalEquationPCGB(__lm_initial_damp); +- UpdateCameraPoint(_cuVectorZK, _cuImageProj); +- ComputeVectorDot(_cuVectorXK, _cuVectorJtE); +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeVectorNorm(_cuVectorJX, __num_cpu_thread[FUNC_VS]); +-} +- +-template +-float SparseBundleCPU::UpdateCameraPoint(VectorF& dx, +- VectorF& cuImageTempProj) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_UP, true); +- +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera); +- ProgramCPU::UpdateCameraPoint( +- _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX, +- _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]); +- return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, _num_point * POINT_ALIGN, +- _num_camera * 8); +- ProgramCPU::UpdateCameraPoint( +- _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX, +- _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]); +- return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj); +- } else { +- if (__jacobian_normalize) ComputeVXY(_cuVectorXK, _cuVectorSJ, dx); +- ProgramCPU::UpdateCameraPoint( +- _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX, +- _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]); +- return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj); +- } +-} +- +-template +-float SparseBundleCPU::SaveUpdatedSystem(float residual_reduction, +- float dx_sqnorm, +- float damping) { +- float expected_reduction; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- VectorF xk; +- xk.set(_cuVectorXK.data(), 8 * _num_camera); +- VectorF jte; +- jte.set(_cuVectorJtE.data(), 8 * _num_camera); +- float dxtg = (float)ComputeVectorDot(xk, jte); +- if (__lm_use_diagonal_damp) { +- VectorF jj; +- jj.set(_cuVectorJJ.data(), 8 * _num_camera); +- float dq = (float)ComputeVectorNormW(xk, jj); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuCameraData.swap(_cuCameraDataEX); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- VectorF xk; +- xk.set(_cuVectorXK.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- VectorF jte; +- jte.set(_cuVectorJtE.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- float dxtg = (float)ComputeVectorDot(xk, jte); +- if (__lm_use_diagonal_damp) { +- VectorF jj; +- jj.set(_cuVectorJJ.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- float dq = (float)ComputeVectorNormW(xk, jj); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuPointData.swap(_cuPointDataEX); +- } else { +- float dxtg = (float)ComputeVectorDot(_cuVectorXK, _cuVectorJtE); +- if (__accurate_gain_ratio) { +- ComputeJX(_cuVectorXK, _cuVectorJX); +- float njx = +- (float)ComputeVectorNorm(_cuVectorJX, __num_cpu_thread[FUNC_VS]); +- expected_reduction = 2.0f * dxtg - njx; +- +- // could the expected reduction be negative??? 
not sure +- if (expected_reduction <= 0) +- expected_reduction = 0.001f * residual_reduction; +- } else if (__lm_use_diagonal_damp) { +- float dq = (float)ComputeVectorNormW(_cuVectorXK, _cuVectorJJ); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- /// save the new motion/struture +- _cuCameraData.swap(_cuCameraDataEX); +- _cuPointData.swap(_cuPointDataEX); +- } +- //////////////////////////////////////////// +- return float(residual_reduction / expected_reduction); +-} +- +-template +-void SparseBundleCPU::AdjustBundleAdjsutmentMode() { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- _cuJacobianPoint.resize(0); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- _cuJacobianCamera.resize(0); +- _cuJacobianCameraT.resize(0); +- } +-} +- +-template +-float SparseBundleCPU::EvaluateDeltaNorm() { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- VectorF temp; +- temp.set(_cuVectorXK.data(), 8 * _num_camera); +- return (float)ComputeVectorNorm(temp); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- VectorF temp; +- temp.set(_cuVectorXK.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- return (float)ComputeVectorNorm(temp); +- } else { +- return (float)ComputeVectorNorm(_cuVectorXK); +- } +-} +- +-template +-void SparseBundleCPU::NonlinearOptimizeLM() { +- //////////////////////////////////////// +- TimerBA timer(this, TIMER_OPTIMIZATION); +- +- //////////////////////////////////////////////// +- float mse_convert_ratio = +- 1.0f / (_num_imgpt * __focal_scaling * __focal_scaling); +- float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f; +- const int edwidth = __verbose_sse ? 12 : 8; +- _projection_sse = +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- __initial_mse = __final_mse = _projection_sse * mse_convert_ratio; +- +- // compute jacobian diagonals for normalization +- if (__jacobian_normalize) PrepareJacobianNormalization(); +- +- // evalaute jacobian +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- /////////////////////////////////////////////////////////////// +- if (__verbose_level) +- std::cout << "Initial " << (__verbose_sse ? "sumed" : "mean") +- << " squared error = " << __initial_mse * error_display_ratio +- << "\n----------------------------------------------\n"; +- +- ////////////////////////////////////////////////// +- VectorF& cuImageTempProj = _cuVectorJX; +- // VectorF& cuVectorTempJX = _cuVectorJX; +- VectorF& cuVectorDX = _cuVectorSJ.size() ? 
_cuVectorZK : _cuVectorXK; +- +- ////////////////////////////////////////////////// +- float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf; +- SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm, +- g_inf); +- +- //////////////////////////////////// +- std::cout << std::left; +- for (int i = 0; i < __lm_max_iteration && !__abort_flag; +- __current_iteration = (++i)) { +- ////solve linear system +- int num_cg_iteration = SolveNormalEquation(damping); +- +- // there must be NaN somewhere +- if (num_cg_iteration == 0) { +- if (__verbose_level) +- std::cout << "#" << std::setw(3) << i << " quit on numeric errors\n"; +- __pba_return_code = 'E'; +- break; +- } +- +- // there must be infinity somewhere +- if (__recent_cg_status == 'I') { +- std::cout << "#" << std::setw(3) << i << " 0 I e=" << std::setw(edwidth) +- << "------- " +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << '\n' << std::setprecision(6); +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- --i; +- continue; +- } +- +- ///////////////////// +- ++__num_lm_iteration; +- +- //////////////////////////////////// +- float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm); +- +- // In this library, we check absolute difference instead of realtive +- // difference +- if (dx_norm <= __lm_delta_threshold) { +- // damping factor must be way too big...or it converges +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " quit on too small change (" << dx_norm << " < " +- << __lm_delta_threshold << ")\n"; +- __pba_return_code = 'S'; +- break; +- } +- /////////////////////////////////////////////////////////////////////// +- // update structure and motion, check reprojection error +- float new_residual = UpdateCameraPoint(cuVectorDX, cuImageTempProj); +- float average_residual = new_residual * mse_convert_ratio; +- float residual_reduction = _projection_sse - new_residual; +- +- // do we find a better solution? +- if (std::isfinite(new_residual) && residual_reduction > 0) { +- ////compute relative norm change +- float relative_reduction = 1.0f - (new_residual / _projection_sse); +- +- //////////////////////////////////// +- __num_lm_success++; // increase counter +- _projection_sse = new_residual; // save the new residual +- _cuImageProj.swap(cuImageTempProj); // save the new projection +- +- ////////////////////compute gain ratio/////////// +- float gain_ratio = +- SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping); +- +- //////////////////////////////////////////////// +- SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping, +- g_norm, g_inf); +- +- ///////////////////////////////////////////// +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=" << std::setw(6) +- << floor(gain_ratio * 1000.f) * 0.001f +- << " g=" << std::setw(g_norm > 0 ? 
9 : 1) << g_norm << " " +- << std::setw(9) << relative_reduction << ' ' << std::setw(9) +- << dx_norm << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- ///////////////////////////// +- if (!IsTimeBudgetAvailable()) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " used up time budget.\n"; +- __pba_return_code = 'T'; +- break; +- } else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i +- << " converged with small gradient\n"; +- __pba_return_code = 'G'; +- break; +- } else if (average_residual * error_display_ratio <= __lm_mse_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " satisfies MSE threshold\n"; +- __pba_return_code = 'M'; +- break; +- } else { +- /////////////////////////////adjust damping factor +- float temp = gain_ratio * 2.0f - 1.0f; +- float adaptive_adjust = 1.0f - temp * temp * temp; // powf(, 3.0f); // +- float auto_adjust = std::max(1.0f / 3.0f, adaptive_adjust); +- +- ////////////////////////////////////////////////// +- damping = damping * auto_adjust; +- damping_adjust = 2.0f; +- if (damping < __lm_minimum_damp) +- damping = __lm_minimum_damp; +- else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp && +- __lm_use_diagonal_damp) +- damping = __lm_maximum_damp; +- +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- } +- } else { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) << std::left +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=----- " << (__lm_check_gradient || __save_gradient_norm +- ? " g=---------" +- : " g=0") +- << " --------- " << std::setw(9) << dx_norm +- << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp && +- damping > __lm_damping_auto_switch) { +- __lm_use_diagonal_damp = false; +- damping = __lm_damping_auto_switch; +- damping_adjust = 2.0f; +- if (__verbose_level > 1) +- std::cout << "NOTE: switch to damping with an identity matix\n"; +- } else { +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- } +- } +- +- if (__verbose_level == 1) std::cout << '.'; +- } +- +- __final_mse = float(_projection_sse * mse_convert_ratio); +- __final_mse_x = +- __use_radial_distortion +- ? 
EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) * +- mse_convert_ratio +- : __final_mse; +-} +- +-#define PROFILE_REPORT2(A, T) \ +- std::cout << std::setw(24) << A << ": " << (T) << "\n"; +- +-#define PROFILE_REPORT(A) \ +- std::cout << std::setw(24) << A << ": " \ +- << (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n"; +- +-#define PROFILE_(B) \ +- BundleTimerStart(TIMER_PROFILE_STEP); \ +- for (int i = 0; i < repeat; ++i) { \ +- B; \ +- } \ +- BundleTimerSwitch(TIMER_PROFILE_STEP); +- +-#define PROFILE(A, B) PROFILE_(A B) PROFILE_REPORT(#A) +-#define PROXILE(A, B) PROFILE_(B) PROFILE_REPORT(A) +-#define PROTILE(FID, A, B) \ +- { \ +- float tbest = FLT_MAX; \ +- int nbest = 1; \ +- int nto = nthread[FID]; \ +- { \ +- std::ostringstream os1; \ +- os1 << #A "(" << nto << ")"; \ +- PROXILE(os1.str(), A B); \ +- } \ +- for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \ +- nthread[FID] = j; \ +- PROFILE_(A B); \ +- float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \ +- if (t > tbest) { \ +- if (j >= max(nto, 16)) break; \ +- } else { \ +- tbest = t; \ +- nbest = j; \ +- } \ +- } \ +- if (nto != 0) nthread[FID] = nbest; \ +- { \ +- std::ostringstream os; \ +- os << #A "(" << nbest << ")"; \ +- PROFILE_REPORT2(os.str(), tbest); \ +- } \ +- } +- +-#define PROTILE2(FID1, FID2, A, B) \ +- { \ +- int nt1 = nthread[FID1], nt2 = nthread[FID2]; \ +- { \ +- std::ostringstream os1; \ +- os1 << #A "(" << nt1 << "," << nt2 << ")"; \ +- PROXILE(os1.str(), A B); \ +- } \ +- float tbest = FLT_MAX; \ +- int nbest1 = 1, nbest2 = 1; \ +- nthread[FID2] = 1; \ +- for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \ +- nthread[FID1] = j; \ +- PROFILE_(A B); \ +- float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \ +- if (t > tbest) { \ +- if (j >= max(nt1, 16)) break; \ +- } else { \ +- tbest = t; \ +- nbest1 = j; \ +- } \ +- } \ +- nthread[FID1] = nbest1; \ +- for (int j = 2; j <= THREAD_NUM_MAX; j *= 2) { \ +- nthread[FID2] = j; \ +- PROFILE_(A B); \ +- float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \ +- if (t > tbest) { \ +- if (j >= max(nt2, 16)) break; \ +- } else { \ +- tbest = t; \ +- nbest2 = j; \ +- } \ +- } \ +- nthread[FID2] = nbest2; \ +- { \ +- std::ostringstream os; \ +- os << #A "(" << nbest1 << "," << nbest2 << ")"; \ +- PROFILE_REPORT2(os.str(), tbest); \ +- } \ +- if (nt1 == 0) nthread[FID1] = 0; \ +- if (nt2 == 0) nthread[FID2] = 0; \ +- } +- +-template +-void SparseBundleCPU::RunProfileSteps() { +- const int repeat = std::max(__profile_pba, 1); +- int* nthread = __num_cpu_thread; +- std::cout << "---------------------------------\n" +- "| Run profiling steps (" +- << repeat << ") |\n" +- "---------------------------------\n" +- << std::left; +- ; +- +- /////////////////////////////////////////////// +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- if (__jacobian_normalize) PrepareJacobianNormalization(); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- ComputeBlockPC(__lm_initial_damp, true); +- /////////////////////////////// +- do { +- if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 && +- SolveNormalEquationPCGB(__lm_initial_damp) == 10) +- break; +- __lm_initial_damp *= 2.0f; +- } while (__lm_initial_damp < 1024.0f); +- std::cout << "damping set to " << __lm_initial_damp << " for profiling\n" +- << "---------------------------------\n"; +- /////////////////////// +- { +- int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration; +- __cg_max_iteration = __cg_min_iteration = 10; +- __num_cg_iteration = 
0; +- PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- std::cout << "---------------------------------\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMX", RunTestIterationLM(true)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMB", RunTestIterationLM(false)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- std::cout << "---------------------------------\n"; +- __cg_max_iteration = cgmax; +- __cg_min_iteration = cgmin; +- } +- +- ///////////////////////////////////////////////////// +- PROFILE(UpdateCameraPoint, (_cuVectorZK, _cuImageProj)); +- PROFILE(ComputeVectorNorm, (_cuVectorXK)); +- PROFILE(ComputeVectorDot, (_cuVectorXK, _cuVectorRK)); +- PROFILE(ComputeVectorNormW, (_cuVectorXK, _cuVectorRK)); +- PROFILE(ComputeSAXPY, ((Float)0.01f, _cuVectorXK, _cuVectorRK, _cuVectorZK)); +- PROFILE(ComputeSXYPZ, +- ((Float)0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK)); +- std::cout << "---------------------------------\n"; +- PROTILE(FUNC_VS, ComputeVectorNorm, +- (_cuImageProj, nthread[FUNC_VS])); // reset the parameter to 0 +- +- /////////////////////////////////////// +- { +- avec temp1(_cuImageProj.size()), temp2(_cuImageProj.size()); +- SetVectorZero(temp1); +- PROTILE(FUNC_VV, ComputeSAXPY, +- ((Float)0.01f, _cuImageProj, temp1, temp2, nthread[FUNC_VV])); +- } +- +- std::cout << "---------------------------------\n"; +- __multiply_jx_usenoj = false; +- +- //////////////////////////////////////////////////// +- PROTILE(FUNC_PJ, EvaluateProjection, +- (_cuCameraData, _cuPointData, _cuImageProj)); +- PROTILE2(FUNC_MPC, FUNC_MPP, ApplyBlockPC, (_cuVectorJtE, _cuVectorPK)); +- +- ///////////////////////////////////////////////// +- if (!__no_jacobian_store) { +- if (__jc_store_original) { +- PROTILE(FUNC_JX, ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- +- if (__jc_store_transpose) { +- PROTILE(FUNC_JJ_JCO_JCT_JP, EvaluateJacobians, ()); +- PROTILE2(FUNC_JTEC_JCT, FUNC_JTEP, ComputeJtE, +- (_cuImageProj, _cuVectorJtE)); +- PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f, true)); +- PROFILE(ComputeDiagonal, (_cuVectorPK)); +- +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- __jc_store_original = false; +- PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ()); +- __jc_store_original = true; +- } +- +- ////////////////////////////////////////////////// +- std::cout << "---------------------------------\n" +- "| Not storing transpose JC | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- _cuJacobianCameraT.resize(0); +- PROTILE(FUNC_JJ_JCO_JP, EvaluateJacobians, ()); +- PROTILE2(FUNC_JTEC_JCO, FUNC_JTEP, ComputeJtE, +- (_cuImageProj, _cuVectorJtE)); +- PROTILE2(FUNC_BCC_JCO, FUNC_BCP, ComputeBlockPC, (0.001f, true)); +- PROFILE(ComputeDiagonal, (_cuVectorPK)); +- } else if (__jc_store_transpose) { +- PROTILE2(FUNC_JTEC_JCT, FUNC_JTEP, 
ComputeJtE, +- (_cuImageProj, _cuVectorJtE)); +- PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f, true)); +- PROFILE(ComputeDiagonal, (_cuVectorPK)); +- +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ()); +- } +- } +- +- if (!__no_jacobian_store) { +- std::cout << "---------------------------------\n" +- "| Not storing Camera Jacobians | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- __jc_store_original = false; +- _cuJacobianCamera.resize(0); +- _cuJacobianCameraT.resize(0); +- PROTILE(FUNC_JJ_JP, EvaluateJacobians, ()); +- PROTILE(FUNC_JTE_, ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- // PROFILE(ComputeBlockPC, (0.001f, true)); +- } +- +- /////////////////////////////////////////////// +- std::cout << "---------------------------------\n" +- "| Not storing any jacobians |\n" +- "---------------------------------\n"; +- __no_jacobian_store = true; +- _cuJacobianPoint.resize(0); +- PROTILE(FUNC_JX_, ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- std::cout << "---------------------------------\n"; +-} +- +-template +-int SparseBundleCPU::FindProcessorCoreNum() { +-#ifdef _WIN32 +-#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP +- SYSTEM_INFO sysinfo; +- GetNativeSystemInfo(&sysinfo); +-#else +- SYSTEM_INFO sysinfo; +- GetSystemInfo(&sysinfo); +-#endif +- return sysinfo.dwNumberOfProcessors; +-#else +- return sysconf(_SC_NPROCESSORS_ONLN); +-#endif +-} +- +-ParallelBA* NewSparseBundleCPU(bool dp, const int num_threads) { +-#ifndef SIMD_NO_DOUBLE +- if (dp) +- return new SparseBundleCPU(num_threads); +- else +-#endif +- return new SparseBundleCPU(num_threads); +-} +- +-} // namespace pba +diff --git a/lib/PBA/SparseBundleCPU.h b/lib/PBA/SparseBundleCPU.h +deleted file mode 100644 +index 73beb9e10..000000000 +--- a/lib/PBA/SparseBundleCPU.h ++++ /dev/null +@@ -1,286 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCPU.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : interface of the CPU-version of multi-core bundle adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#if !defined(SPARSE_BUNDLE_CPU_H) +-#define SPARSE_BUNDLE_CPU_H +- +-// BYTE-ALIGNMENT for data allocation (16 required for SSE, 32 required for AVX) +-// PREVIOUS version uses only SSE. The new version will include AVX. 
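// (Aligned AVX loads/stores such as _mm256_load_ps require 32-byte aligned
//  addresses, while SSE needs only 16; the ALIGN_PTR macro defined below just
//  rounds a pointer up to the next multiple of VECTOR_ALIGNMENT, e.g.
//  ALIGN_PTR(0x1001) == 0x1020 when VECTOR_ALIGNMENT is 32.)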
+-// SO the alignment is increased from 16 to 32 +-#define VECTOR_ALIGNMENT 32 +-#define FLOAT_ALIGN 8 +-#define VECTOR_ALIGNMENT_MASK (VECTOR_ALIGNMENT - 1) +-#define ALIGN_PTR(p) \ +- ((((size_t)p) + VECTOR_ALIGNMENT_MASK) & (~VECTOR_ALIGNMENT_MASK)) +- +-namespace pba { +- +-template +-class avec { +- bool _owner; +- Float* _data; +- Float* _last; +- size_t _size; +- size_t _capacity; +- +- public: +- static Float* allocate(size_t count) { +- size_t size = count * sizeof(Float); +-#ifdef _MSC_VER +- Float* p = (Float*)_aligned_malloc(size, VECTOR_ALIGNMENT); +- if (p == NULL) throw std::bad_alloc(); +- return p; +-#else +- char* p = (char*)malloc(size + VECTOR_ALIGNMENT + 4); +- if (p == NULL) throw std::bad_alloc(); +- char* p1 = p + 1; +- char* p2 = +- (char*)ALIGN_PTR(p1); //(char*) (((((size_t)p1) + 15) >> 4) << 4); +- char* p3 = (p2 - 1); +- p3[0] = (p2 - p); +- return (Float*)p2; +-#endif +- } +- static void deallocate(void* p) { +-#ifdef _MSC_VER +- _aligned_free(p); +-#else +- char* p3 = ((char*)p) - 1; +- free(((char*)p) - p3[0]); +-#endif +- } +- +- public: +- avec() { +- _owner = true; +- _last = _data = NULL; +- _size = _capacity = 0; +- } +- avec(size_t count) { +- _data = allocate(count); +- _size = _capacity = count; +- _last = _data + count; +- _owner = true; +- } +- ~avec() { +- if (_data && _owner) deallocate(_data); +- } +- +- inline void resize(size_t newcount) { +- if (!_owner) { +- _data = _last = NULL; +- _capacity = _size = 0; +- _owner = true; +- } +- if (newcount <= _capacity) { +- _size = newcount; +- _last = _data + newcount; +- } else { +- if (_data && _owner) deallocate(_data); +- _data = allocate(newcount); +- _size = _capacity = newcount; +- _last = _data + newcount; +- } +- } +- +- inline void set(Float* data, size_t count) { +- if (_data && _owner) deallocate(_data); +- _data = data; +- _owner = false; +- _size = count; +- _last = _data + _size; +- _capacity = count; +- } +- inline void swap(avec& next) { +- bool _owner_bak = _owner; +- Float* _data_bak = _data; +- Float* _last_bak = _last; +- size_t _size_bak = _size; +- size_t _capa_bak = _capacity; +- +- _owner = next._owner; +- _data = next._data; +- _last = next._last; +- _size = next._size; +- _capacity = next._capacity; +- +- next._owner = _owner_bak; +- next._data = _data_bak; +- next._last = _last_bak; +- next._size = _size_bak; +- next._capacity = _capa_bak; +- } +- +- inline operator Float*() { return _size ? _data : NULL; } +- inline operator Float* const() const { return _data; } +- inline Float* begin() { return _size ? _data : NULL; } +- inline Float* data() { return _size ? _data : NULL; } +- inline Float* end() { return _last; } +- inline const Float* begin() const { return _size ? 
_data : NULL; } +- inline const Float* end() const { return _last; } +- inline size_t size() const { return _size; } +- inline size_t IsValid() const { return _size; } +- void SaveToFile(const char* name); +-}; +- +-template +-class SparseBundleCPU : public ParallelBA, public ConfigBA { +- public: +- SparseBundleCPU(const int num_threads); +- +- typedef avec VectorF; +- typedef std::vector VectorI; +- typedef float float_t; +- +- protected: // cpu data +- int _num_camera; +- int _num_point; +- int _num_imgpt; +- CameraT* _camera_data; +- float* _point_data; +- +- //////////////////////////////// +- const float* _imgpt_data; +- const int* _camera_idx; +- const int* _point_idx; +- const int* _focal_mask; +- +- ///////////sumed square error +- float _projection_sse; +- +- protected: // cuda data +- VectorF _cuCameraData; +- VectorF _cuCameraDataEX; +- VectorF _cuPointData; +- VectorF _cuPointDataEX; +- VectorF _cuMeasurements; +- VectorF _cuImageProj; +- VectorF _cuJacobianCamera; +- VectorF _cuJacobianPoint; +- VectorF _cuJacobianCameraT; +- VectorI _cuProjectionMap; +- VectorI _cuPointMeasurementMap; +- VectorI _cuCameraMeasurementMap; +- VectorI _cuCameraMeasurementList; +- VectorI _cuCameraMeasurementListT; +- +- ////////////////////////// +- VectorF _cuBlockPC; +- VectorF _cuVectorSJ; +- +- /// LM normal equation +- VectorF _cuVectorJtE; +- VectorF _cuVectorJJ; +- VectorF _cuVectorJX; +- VectorF _cuVectorXK; +- VectorF _cuVectorPK; +- VectorF _cuVectorZK; +- VectorF _cuVectorRK; +- +- ////////////////////////////////// +- protected: +- int _num_imgpt_q; +- float _weight_q; +- VectorI _cuCameraQList; +- VectorI _cuCameraQMap; +- VectorF _cuCameraQMapW; +- VectorF _cuCameraQListW; +- +- protected: +- bool ProcessIndexCameraQ(std::vector& qmap, std::vector& qlist); +- void ProcessWeightCameraQ(std::vector& cpnum, std::vector& qmap, +- Float* qmapw, Float* qlistw); +- +- protected: // internal functions +- int ValidateInputData(); +- int InitializeBundle(); +- int GetParameterLength(); +- void BundleAdjustment(); +- void NormalizeData(); +- void TransferDataToHost(); +- void DenormalizeData(); +- void NormalizeDataF(); +- void NormalizeDataD(); +- bool InitializeStorageForSFM(); +- bool InitializeStorageForCG(); +- +- void SaveBundleRecord(int iter, float res, float damping, float& g_norm, +- float& g_inf); +- +- protected: +- void PrepareJacobianNormalization(); +- void EvaluateJacobians(); +- void ComputeJtE(VectorF& E, VectorF& JtE, int mode = 0); +- void ComputeJX(VectorF& X, VectorF& JX, int mode = 0); +- void ComputeDiagonal(VectorF& JJI); +- void ComputeBlockPC(float lambda, bool dampd); +- void ApplyBlockPC(VectorF& v, VectorF& pv, int mode = 0); +- float UpdateCameraPoint(VectorF& dx, VectorF& cuImageTempProj); +- float EvaluateProjection(VectorF& cam, VectorF& point, VectorF& proj); +- float EvaluateProjectionX(VectorF& cam, VectorF& point, VectorF& proj); +- float SaveUpdatedSystem(float residual_reduction, float dx_sqnorm, +- float damping); +- float EvaluateDeltaNorm(); +- int SolveNormalEquationPCGB(float lambda); +- int SolveNormalEquationPCGX(float lambda); +- int SolveNormalEquation(float lambda); +- void NonlinearOptimizeLM(); +- void AdjustBundleAdjsutmentMode(); +- void RunProfileSteps(); +- void RunTestIterationLM(bool reduced); +- void DumpCooJacobian(); +- +- private: +- static int FindProcessorCoreNum(); +- +- public: +- virtual void AbortBundleAdjustment() { __abort_flag = true; } +- virtual int GetCurrentIteration() { return __current_iteration; } +- virtual void 
SetNextTimeBudget(int seconds) { +- __bundle_time_budget = seconds; +- } +- virtual void SetNextBundleMode(BundleModeT mode) { +- __bundle_mode_next = mode; +- } +- virtual void SetFixedIntrinsics(bool fixed) { __fixed_intrinsics = fixed; } +- virtual void EnableRadialDistortion(DistortionT type) { +- __use_radial_distortion = type; +- } +- virtual void ParseParam(int narg, char** argv) { +- ConfigBA::ParseParam(narg, argv); +- } +- virtual ConfigBA* GetInternalConfig() { return this; } +- +- public: +- SparseBundleCPU(); +- virtual void SetCameraData(size_t ncam, CameraT* cams); +- virtual void SetPointData(size_t npoint, Point3D* pts); +- virtual void SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx); +- virtual void SetFocalMask(const int* fmask, float weight); +- virtual float GetMeanSquaredError(); +- virtual int RunBundleAdjustment(); +-}; +- +-ParallelBA* NewSparseBundleCPU(bool dp, const int num_threads); +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/SparseBundleCU.cpp b/lib/PBA/SparseBundleCU.cpp +deleted file mode 100644 +index 95929971f..000000000 +--- a/lib/PBA/SparseBundleCU.cpp ++++ /dev/null +@@ -1,1989 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCU.cpp +-// Author: Changchang Wu +-// Description : implementation of the CUDA-based multicore bundle adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-using std::vector; +-using std::cout; +-using std::pair; +-using std::ofstream; +- +-#include +-#include +-#include +-#include "pba.h" +-#include "SparseBundleCU.h" +- +-#include "ProgramCU.h" +- +-using namespace pba::ProgramCU; +- +-#ifdef _WIN32 +-#define finite _finite +-#endif +- +-namespace pba { +- +-typedef float float_t; // data type for host computation; double doesn't make +- // much difference +- +-#define CHECK_VEC(v1, v2) \ +- for (size_t j = 0; j < v1.size(); ++j) { \ +- if (v1[j] != v2[j]) { \ +- different++; \ +- std::cout << i << ' ' << j << ' ' << v1[j] << ' ' << v2[j] << '\n'; \ +- } \ +- } +-#define DEBUG_FUNCN(v, func, input, N) \ +- if (__debug_pba && v.IsValid()) { \ +- vector buf(v.GetLength()), buf_(v.GetLength()); \ +- for (int i = 0; i < N; ++i) { \ +- int different = 0; \ +- func input; \ +- ProgramCU::FinishWorkCUDA(); \ +- if (i > 0) { \ +- v.CopyToHost(&buf_[0]); \ +- CHECK_VEC(buf, buf_); \ +- } else { \ +- v.CopyToHost(&buf[0]); \ +- } \ +- if (different != 0) \ +- std::cout << #func << " : " << i << " : " << different << '\n'; \ +- } \ +- } +-#define DEBUG_FUNC(v, func, input) DEBUG_FUNCN(v, func, input, 2) +- +-SparseBundleCU::SparseBundleCU(int device) +- : ParallelBA(PBA_INVALID_DEVICE), +- _num_camera(0), +- _num_point(0), +- _num_imgpt(0), +- _num_imgpt_q(0), +- _camera_data(NULL), +- _point_data(NULL), +- _imgpt_data(NULL), +- _camera_idx(NULL), +- _point_idx(NULL), +- _projection_sse(0) { +- __selected_device = device; +-} +- +-size_t SparseBundleCU::GetMemCapacity() { +- if (__selected_device != __current_device) SetCudaDevice(__selected_device); +- size_t sz = ProgramCU::GetCudaMemoryCap(); +- if (sz < 1024) std::cout << "ERROR: CUDA is unlikely to be supported!\n"; +- return sz < 1024 ? 0 : sz; +-} +- +-void SparseBundleCU::SetCameraData(size_t ncam, CameraT* cams) { +- if (sizeof(CameraT) != 16 * sizeof(float)) exit(0); // never gonna happen...? 
+- _num_camera = (int)ncam; +- _camera_data = cams; +- _focal_mask = NULL; +-} +- +-void SparseBundleCU::SetFocalMask(const int* fmask, float weight) { +- _focal_mask = fmask; +- _weight_q = weight; +-} +- +-void SparseBundleCU::SetPointData(size_t npoint, Point3D* pts) { +- _num_point = (int)npoint; +- _point_data = (float*)pts; +-} +- +-void SparseBundleCU::SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx) { +- _num_imgpt = (int)nproj; +- _imgpt_data = (float*)imgpts; +- _camera_idx = cam_idx; +- _point_idx = point_idx; +- _imgpt_datax.resize(0); +-} +- +-float SparseBundleCU::GetMeanSquaredError() { +- return float(_projection_sse / +- (_num_imgpt * __focal_scaling * __focal_scaling)); +-} +- +-void SparseBundleCU::BundleAdjustment() { +- if (ValidateInputData() != STATUS_SUCCESS) return; +- +- // +- +- //////////////////////// +- TimerBA timer(this, TIMER_OVERALL); +- +- NormalizeData(); +- if (InitializeBundle() != STATUS_SUCCESS) { +- // failed to allocate gpu storage +- } else if (__profile_pba) { +- // profiling some stuff +- RunProfileSteps(); +- } else { +- // real optimization +- AdjustBundleAdjsutmentMode(); +- NonlinearOptimizeLM(); +- TransferDataToHost(); +- } +- DenormalizeData(); +-} +- +-int SparseBundleCU::RunBundleAdjustment() { +- if (__warmup_device) WarmupDevice(); +- ResetBundleStatistics(); +- BundleAdjustment(); +- if (__num_lm_success > 0) +- SaveBundleStatistics(_num_camera, _num_point, _num_imgpt); +- if (__num_lm_success > 0) PrintBundleStatistics(); +- ResetTemporarySetting(); +- return __num_lm_success; +-} +- +-bool SparseBundleCU::InitializeBundleGPU() { +- bool previous_allocated = __memory_usage > 0; +- +- bool success = TransferDataToGPU() && InitializeStorageForCG(); +- if (!success && previous_allocated) { +- if (__verbose_level) std::cout << "WARNING: try clean allocation\n"; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- +- if (!success && __jc_store_original) { +- if (__verbose_level) std::cout << "WARNING: try not storing original JC\n"; +- __jc_store_original = false; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- if (!success && __jc_store_transpose) { +- if (__verbose_level) std::cout << "WARNING: try not storing transpose JC\n"; +- __jc_store_transpose = false; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- if (!success && !__no_jacobian_store) { +- if (__verbose_level) std::cout << "WARNING: switch to memory saving mode\n"; +- __no_jacobian_store = true; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- return success; +-} +- +-int SparseBundleCU::ValidateInputData() { +- if (_camera_data == NULL) return STATUS_CAMERA_MISSING; +- if (_point_data == NULL) return STATUS_POINT_MISSING; +- if (_imgpt_data == NULL) return STATUS_MEASURMENT_MISSING; +- if (_camera_idx == NULL || _point_idx == NULL) +- return STATUS_PROJECTION_MISSING; +- return STATUS_SUCCESS; +-} +- +-void SparseBundleCU::WarmupDevice() { +- std::cout << "Warm up device with storage allocation...\n"; +- if (__selected_device != __current_device) SetCudaDevice(__selected_device); +- CheckRequiredMemX(); +- InitializeBundleGPU(); +-} +- +-int SparseBundleCU::InitializeBundle() { +- ///////////////////////////////////////////////////// +- TimerBA 
timer(this, TIMER_GPU_ALLOCATION); +- if (__selected_device != __current_device) SetCudaDevice(__selected_device); +- CheckRequiredMemX(); +- ReserveStorageAuto(); +- if (!InitializeBundleGPU()) return STATUS_ALLOCATION_FAIL; +- return STATUS_SUCCESS; +-} +- +-int SparseBundleCU::GetParameterLength() { +- return _num_camera * 8 + 4 * _num_point; +-} +- +-bool SparseBundleCU::CheckRequiredMemX() { +- if (CheckRequiredMem(0)) return true; +- if (__jc_store_original) { +- if (__verbose_level) std::cout << "NOTE: not storing original JC\n"; +- __jc_store_original = false; +- if (CheckRequiredMem(1)) return true; +- } +- if (__jc_store_transpose) { +- if (__verbose_level) std::cout << "NOTE: not storing camera Jacobian\n"; +- __jc_store_transpose = false; +- if (CheckRequiredMem(1)) return true; +- } +- if (!__no_jacobian_store) { +- if (__verbose_level) std::cout << "NOTE: not storing any Jacobian\n"; +- __no_jacobian_store = true; +- if (CheckRequiredMem(1)) return true; +- } +- return false; +-} +- +-bool SparseBundleCU::CheckRequiredMem(int fresh) { +- int m = _num_camera, n = _num_point, k = _num_imgpt; +-#ifdef PBA_CUDA_ALLOCATE_MORE +- if (!fresh) { +- int m0 = _cuCameraData.GetReservedWidth(); +- m = std::max(m, m0); +- int n0 = _cuPointData.GetReservedWidth(); +- n = std::max(n, n0); +- int k0 = _cuMeasurements.GetReservedWidth(); +- k = std::max(k, k0); +- } +-#endif +- +- int p = 8 * m + 4 * n, q = _num_imgpt_q; +- size_t szn, total = GetCudaMemoryCap(); +- size_t sz0 = 800 * 600 * 2 * 4 * sizeof(float); // +- size_t szq = q > 0 ? (sizeof(float) * (q + m) * 4) : 0; +- size_t sz = sizeof(float) * (258 + 9 * n + 33 * m + 7 * k) + sz0; +- +- /////////////////////////////////// CG +- sz += p * 6 * sizeof(float); +- sz += ((__use_radial_distortion ? 64 : 56) * m + 12 * n) * sizeof(float); +- sz += (2 * (k + q) * sizeof(float)); +- if (sz > total) return false; +- +- ///////////////////////////////////// +- szn = (__no_jacobian_store ? 0 : (sizeof(float) * 8 * k)); +- if (sz + szn > total) +- __no_jacobian_store = false; +- else +- sz += szn; +- ///////////////////////////// +- szn = ((!__no_jacobian_store && __jc_store_transpose) ? 16 * k * sizeof(float) +- : 0); +- if (sz + szn > total) +- __jc_store_transpose = false; +- else +- sz += szn; +- /////////////////////////// +- szn = ((!__no_jacobian_store && __jc_store_original) ? 16 * k * sizeof(float) +- : 0); +- if (sz + szn > total) +- __jc_store_original = false; +- else +- sz += szn; +- /////////////////////////////// +- szn = ((!__no_jacobian_store && __jc_store_transpose && !__jc_store_original) +- ? k * sizeof(int) +- : 0); +- if (sz + szn > total) { +- __jc_store_transpose = false; +- sz -= (16 * k * sizeof(float)); +- } else +- sz += szn; +- +- return sz <= total; +-} +- +-void SparseBundleCU::ReserveStorage(size_t ncam, size_t npt, size_t nproj) { +- if (ncam <= 1 || npt <= 1 || nproj <= 1) { +- ReleaseAllocatedData(); +- // Reset the memory strategy to the default. 
+- __jc_store_transpose = true; +- __jc_store_original = true; +- __no_jacobian_store = false; +- } else { +- const int* camidx = _camera_idx; +- const int* ptidx = _point_idx; +- int ncam_ = _num_camera; +- int npt_ = _num_point; +- int nproj_ = _num_imgpt; +- +-#ifdef PBA_CUDA_ALLOCATE_MORE +- size_t ncam_reserved = _cuCameraData.GetReservedWidth(); +- size_t npt_reserved = _cuPointData.GetReservedWidth(); +- size_t nproj_reserved = _cuMeasurements.GetReservedWidth(); +- ncam = std::max(ncam, ncam_reserved); +- npt = std::max(npt, npt_reserved); +- nproj = std::max(nproj, nproj_reserved); +-#endif +- +- _camera_idx = NULL; +- _point_idx = NULL; +- _num_camera = (int)ncam; +- _num_point = (int)npt; +- _num_imgpt = (int)nproj; +- +- if (__verbose_level) +- std::cout << "Reserving storage for ncam = " << ncam << "; npt = " << npt +- << "; nproj = " << nproj << '\n'; +- InitializeBundleGPU(); +- +- _num_camera = ncam_; +- _num_point = npt_; +- _num_imgpt = nproj_; +- _camera_idx = camidx; +- _point_idx = ptidx; +- } +-} +- +-static size_t upgrade_dimension(size_t sz) { +- size_t x = 1; +- while (x < sz) x <<= 1; +- return x; +-} +- +-void SparseBundleCU::ReserveStorageAuto() { +- if (_cuCameraData.data() == NULL || _cuPointData.data() == NULL || +- _cuMeasurements.data() == NULL) +- return; +- ReserveStorage(upgrade_dimension(_num_camera), upgrade_dimension(_num_point), +- upgrade_dimension(_num_imgpt)); +-} +- +-#define REPORT_ALLOCATION(NAME) \ +- if (__verbose_allocation && NAME.GetDataSize() > 1024) \ +- std::cout << (NAME.GetDataSize() > 1024 * 1024 \ +- ? NAME.GetDataSize() / 1024 / 1024 \ +- : NAME.GetDataSize() / 1024) \ +- << (NAME.GetDataSize() > 1024 * 1024 ? "MB" : "KB") \ +- << "\t allocated for " #NAME "\n"; +- +-#define ASSERT_ALLOCATION(NAME) \ +- if (!success) { \ +- std::cerr << "WARNING: failed to allocate " \ +- << (__verbose_allocation ? #NAME "; size = " : "") \ +- << (total_sz / 1024 / 1024) << "MB + " \ +- << (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \ +- return false; \ +- } else { \ +- total_sz += NAME.GetDataSize(); \ +- REPORT_ALLOCATION(NAME); \ +- } +- +-#define CHECK_ALLOCATION(NAME) \ +- if (NAME.GetDataSize() == 0 && NAME.GetRequiredSize() > 0) { \ +- ClearPreviousError(); \ +- std::cerr << "WARNING: unable to allocate " #NAME ": " \ +- << (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \ +- } else { \ +- total_sz += NAME.GetDataSize(); \ +- REPORT_ALLOCATION(NAME); \ +- } +- +-#define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \ +- { \ +- success &= NAME.InitTexture(num, 1, channels); \ +- ASSERT_ALLOCATION(NAME); \ +- } +- +-#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \ +- if (option) { \ +- option = NAME.InitTexture(num, 1, channels); \ +- CHECK_ALLOCATION(NAME); \ +- } else { \ +- NAME.InitTexture(0, 0, 0); \ +- } +- +-bool SparseBundleCU::TransferDataToGPU() { +- // given m camera, npoint, k measurements.. 
the number of float is +- bool success = true; +- size_t total_sz = 0; +- +- ///////////////////////////////////////////////////////////////////////////// +- vector qmap, qlist; +- vector qmapw, qlistw; +- ProcessIndexCameraQ(qmap, qlist); +- +- ////////////////////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuBufferData, 256, 1); // 256 +- ALLOCATE_REQUIRED_DATA(_cuPointData, _num_point, 4); // 4n +- ALLOCATE_REQUIRED_DATA(_cuCameraData, _num_camera, 16); // 16m +- ALLOCATE_REQUIRED_DATA(_cuCameraDataEX, _num_camera, 16); // 16m +- +- //////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementMap, _num_camera + 1, 1); // m +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementList, _num_imgpt, 1); // k +- ALLOCATE_REQUIRED_DATA(_cuPointMeasurementMap, _num_point + 1, 1); // n +- ALLOCATE_REQUIRED_DATA(_cuProjectionMap, _num_imgpt, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuImageProj, _num_imgpt + _num_imgpt_q, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuPointDataEX, _num_point, 4); // 4n +- ALLOCATE_REQUIRED_DATA(_cuMeasurements, _num_imgpt, 2); // 2k +- +- // +- ALLOCATE_REQUIRED_DATA(_cuCameraQMap, _num_imgpt_q, 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQMapW, _num_imgpt_q, 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQList, (_num_imgpt_q > 0 ? _num_camera : 0), +- 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQListW, (_num_imgpt_q > 0 ? _num_camera : 0), +- 2); +- +- if (__no_jacobian_store) { +- _cuJacobianCamera.ReleaseData(); +- _cuJacobianCameraT.ReleaseData(); +- _cuJacobianPoint.ReleaseData(); +- _cuCameraMeasurementListT.ReleaseData(); +- } else { +- ALLOCATE_REQUIRED_DATA(_cuJacobianPoint, _num_imgpt * 2, 4); // 8k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCameraT, _num_imgpt * 2, 8, +- __jc_store_transpose); // 16k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCamera, _num_imgpt * 2, 8, +- __jc_store_original); // 16k +- +- if ((!__jc_store_original || __profile_pba) && __jc_store_transpose) { +- ALLOCATE_OPTIONAL_DATA(_cuCameraMeasurementListT, _num_imgpt, 1, +- __jc_store_transpose); // k +- if (!__jc_store_transpose) _cuJacobianCameraT.ReleaseData(); +- } else { +- _cuCameraMeasurementListT.ReleaseData(); +- } +- } +- +- ///////////////////////////////////////////////// +- if (_camera_idx && _point_idx) { +- ////////////////////////////////////////// +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- ////mapping from camera to measuremnts +- vector cpi(_num_camera + 1), cpidx(_num_imgpt); +- vector cpnum(_num_camera, 0); +- cpi[0] = 0; +- for (int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++; +- for (int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1]; +- vector cptidx = cpi; +- for (int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i; +- if (_num_imgpt_q > 0) ProcessWeightCameraQ(cpnum, qmap, qmapw, qlistw); +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- +- /////////////////////////////////////////////////////////////////////////////// +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- _cuMeasurements.CopyFromHost(_imgpt_datax.size() > 0 ? 
&_imgpt_datax[0] +- : _imgpt_data); +- _cuCameraData.CopyFromHost(_camera_data); +- _cuPointData.CopyFromHost(_point_data); +- _cuCameraMeasurementMap.CopyFromHost(&cpi[0]); +- _cuCameraMeasurementList.CopyFromHost(&cpidx[0]); +- if (_cuCameraMeasurementListT.IsValid()) { +- vector ridx(_num_imgpt); +- for (int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i; +- _cuCameraMeasurementListT.CopyFromHost(&ridx[0]); +- } +- if (_num_imgpt_q > 0) { +- _cuCameraQMap.CopyFromHost(&qmap[0]); +- _cuCameraQMapW.CopyFromHost(&qmapw[0]); +- _cuCameraQList.CopyFromHost(&qlist[0]); +- _cuCameraQListW.CopyFromHost(&qlistw[0]); +- } +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- +- //////////////////////////////////////////// +- ///////mapping from point to measurment +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- vector ppi(_num_point + 1); +- for (int i = 0, last_point = -1; i < _num_imgpt; ++i) { +- int pt = _point_idx[i]; +- while (last_point < pt) ppi[++last_point] = i; +- } +- ppi[_num_point] = _num_imgpt; +- +- //////////projection map +- vector projection_map(_num_imgpt * 2); +- for (int i = 0; i < _num_imgpt; ++i) { +- int* imp = &projection_map[i * 2]; +- imp[0] = _camera_idx[i] * 2; +- imp[1] = _point_idx[i]; +- } +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- +- ////////////////////////////////////////////////////////////// +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- _cuPointMeasurementMap.CopyFromHost(&ppi[0]); +- _cuProjectionMap.CopyFromHost(&projection_map[0]); +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- } +- +- __memory_usage = total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Motion/Structure/Jacobian:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- return success; +-} +- +-bool SparseBundleCU::ProcessIndexCameraQ(vector& qmap, +- vector& qlist) { +- // reset q-data +- qmap.resize(0); +- qlist.resize(0); +- _num_imgpt_q = 0; +- +- // verify input +- if (_camera_idx == NULL) return true; +- if (_point_idx == NULL) return true; +- if (_focal_mask == NULL) return true; +- if (_num_camera == 0) return true; +- if (_weight_q <= 0) return true; +- +- /////////////////////////////////////// +- +- int error = 0; +- vector temp(_num_camera * 2, -1); +- +- for (int i = 0; i < _num_camera; ++i) { +- int iq = _focal_mask[i]; +- if (iq > i) { +- error = 1; +- break; +- } +- if (iq < 0) continue; +- if (iq == i) continue; +- int ip = temp[2 * iq]; +- // float ratio = _camera_data[i].f / _camera_data[iq].f; +- // if(ratio < 0.01 || ratio > 100) +- //{ +- // std::cout << "Warning: constaraints on largely different camreas\n"; +- // continue; +- //}else +- if (_focal_mask[iq] != iq) { +- error = 1; +- break; +- } else if (ip == -1) { +- temp[2 * iq] = i; +- temp[2 * iq + 1] = i; +- temp[2 * i] = iq; +- temp[2 * i + 1] = iq; +- } else { +- // maintain double-linked list +- temp[2 * i] = ip; +- temp[2 * i + 1] = iq; +- temp[2 * ip + 1] = i; +- temp[2 * iq] = i; +- } +- } +- +- if (error) { +- std::cout << "Error: incorrect constraints\n"; +- _focal_mask = NULL; +- return false; +- } +- +- qlist.resize(_num_camera * 2, -1); +- for (int i = 0; i < _num_camera; ++i) { +- int inext = temp[2 * i + 1]; +- if (inext == -1) continue; +- qlist[2 * i] = _num_imgpt + _num_imgpt_q; +- qlist[2 * inext + 1] = _num_imgpt + _num_imgpt_q; +- qmap.push_back(i); +- qmap.push_back(inext); +- _num_imgpt_q++; +- } +- return true; +-} +- +-void SparseBundleCU::ProcessWeightCameraQ(vector& cpnum, vector& qmap, +- vector& 
qmapw, +- vector& qlistw) { +- // set average focal length and average radial distortion +- vector qpnum(_num_camera, 0), qcnum(_num_camera, 0); +- vector fs(_num_camera, 0), rs(_num_camera, 0); +- +- for (int i = 0; i < _num_camera; ++i) { +- int qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- fs[qi] += _camera_data[i].f; +- rs[qi] += _camera_data[i].radial; +- qpnum[qi] += cpnum[i]; +- qcnum[qi] += 1.0f; +- } +- +- // this seems not really matter..they will converge anyway +- for (int i = 0; i < _num_camera; ++i) { +- int qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- _camera_data[i].f = fs[qi] / qcnum[qi]; +- _camera_data[i].radial = rs[qi] / qcnum[qi]; +- } +- +- qmapw.resize(_num_imgpt_q * 2, 0); +- qlistw.resize(_num_camera * 2, 0); +- for (int i = 0; i < _num_imgpt_q; ++i) { +- int cidx = qmap[i * 2], qi = _focal_mask[cidx]; +- float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q; +- float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0); +- qmapw[i * 2] = wi; +- qmapw[i * 2 + 1] = wr; +- qlistw[cidx * 2] = wi; +- qlistw[cidx * 2 + 1] = wr; +- } +-} +- +-void SparseBundleCU::ReleaseAllocatedData() { +- _cuCameraData.ReleaseData(); +- _cuCameraDataEX.ReleaseData(); +- _cuPointData.ReleaseData(); +- _cuPointDataEX.ReleaseData(); +- _cuMeasurements.ReleaseData(); +- _cuImageProj.ReleaseData(); +- _cuJacobianCamera.ReleaseData(); +- _cuJacobianPoint.ReleaseData(); +- _cuJacobianCameraT.ReleaseData(); +- _cuProjectionMap.ReleaseData(); +- _cuPointMeasurementMap.ReleaseData(); +- _cuCameraMeasurementMap.ReleaseData(); +- _cuCameraMeasurementList.ReleaseData(); +- _cuCameraMeasurementListT.ReleaseData(); +- _cuBufferData.ReleaseData(); +- _cuBlockPC.ReleaseData(); +- _cuVectorJtE.ReleaseData(); +- _cuVectorJJ.ReleaseData(); +- _cuVectorJX.ReleaseData(); +- _cuVectorXK.ReleaseData(); +- _cuVectorPK.ReleaseData(); +- _cuVectorZK.ReleaseData(); +- _cuVectorRK.ReleaseData(); +- _cuVectorSJ.ReleaseData(); +- _cuCameraQList.ReleaseData(); +- _cuCameraQMap.ReleaseData(); +- _cuCameraQMapW.ReleaseData(); +- _cuCameraQListW.ReleaseData(); +- ProgramCU::ResetCurrentDevice(); +-} +- +-void SparseBundleCU::NormalizeDataF() { +- int incompatible_radial_distortion = 0; +- if (__focal_normalize) { +- if (__focal_scaling == 1.0f) { +- //------------------------------------------------------------------ +- ////////////////////////////////////////////////////////////// +- vector focals(_num_camera); +- for (int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f; +- std::nth_element(focals.begin(), focals.begin() + _num_camera / 2, +- focals.end()); +- float median_focal_length = focals[_num_camera / 2]; +- __focal_scaling = __data_normalize_median / median_focal_length; +- float radial_factor = median_focal_length * median_focal_length * 4.0f; +- +- /////////////////////////////// +- _imgpt_datax.resize(_num_imgpt * 2); +- for (int i = 0; i < _num_imgpt * 2; ++i) +- _imgpt_datax[i] = _imgpt_data[i] * __focal_scaling; +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f *= __focal_scaling; +- if (!__use_radial_distortion) { +- } else if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- incompatible_radial_distortion++; +- _camera_data[i].radial = 0; +- } else if 
(__use_radial_distortion == -1) { +- _camera_data[i].radial *= radial_factor; +- } +- } +- if (__verbose_level > 2) +- std::cout << "Focal length normalized by " << __focal_scaling << '\n'; +- __reset_initial_distortion = false; +- } +- } else { +- if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) { +- if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- _camera_data[i].radial = 0; +- incompatible_radial_distortion++; +- } +- } +- __reset_initial_distortion = false; +- } +- _imgpt_datax.resize(0); +- } +- +- if (incompatible_radial_distortion) { +- std::cout << "ERROR: incompatible radial distortion input; reset to 0;\n"; +- } +-} +- +-void SparseBundleCU::NormalizeDataD() { +- if (__depth_scaling == 1.0f) { +- const float dist_bound = 1.0f; +- vector oz(_num_imgpt); +- vector cpdist1(_num_camera, dist_bound); +- vector cpdist2(_num_camera, -dist_bound); +- vector camnpj(_num_camera, 0), cambpj(_num_camera, 0); +- int bad_point_count = 0; +- for (int i = 0; i < _num_imgpt; ++i) { +- int cmidx = _camera_idx[i]; +- CameraT* cam = _camera_data + cmidx; +- float* rz = cam->m[2]; +- float* x = _point_data + 4 * _point_idx[i]; +- oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->t[2]); +- +- ///////////////////////////////////////////////// +- // points behind camera may causes big problem +- float ozr = oz[i] / cam->t[2]; +- if (fabs(ozr) < __depth_check_epsilon) { +- bad_point_count++; +- float px = cam->f * (cam->m[0][0] * x[0] + cam->m[0][1] * x[1] + +- cam->m[0][2] * x[2] + cam->t[0]); +- float py = cam->f * (cam->m[1][0] * x[0] + cam->m[1][1] * x[1] + +- cam->m[1][2] * x[2] + cam->t[1]); +- float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1]; +- bool checkx = fabs(mx) > fabs(my); +- if ((checkx && px * oz[i] * mx < 0 && fabs(mx) > 64) || +- (!checkx && py * oz[i] * my < 0 && fabs(my) > 64)) { +- if (__verbose_level > 3) +- std::cout << "Warning: proj of #" << cmidx +- << " on the wrong side, oz = " << oz[i] << " (" +- << (px / oz[i]) << ',' << (py / oz[i]) << ") (" << mx +- << ',' << my << ")\n"; +- ///////////////////////////////////////////////////////////////////////// +- if (oz[i] > 0) +- cpdist2[cmidx] = 0; +- else +- cpdist1[cmidx] = 0; +- } +- if (oz[i] >= 0) +- cpdist1[cmidx] = std::min(cpdist1[cmidx], oz[i]); +- else +- cpdist2[cmidx] = std::max(cpdist2[cmidx], oz[i]); +- } +- if (oz[i] < 0) { +- __num_point_behind++; +- cambpj[cmidx]++; +- } +- camnpj[cmidx]++; +- } +- if (bad_point_count > 0 && __depth_degeneracy_fix) { +- if (!__focal_normalize || !__depth_normalize) +- std::cout << "Enable data normalization on degeneracy\n"; +- __focal_normalize = true; +- __depth_normalize = true; +- } +- if (__depth_normalize) { +- std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end()); +- float oz_median = oz[_num_imgpt / 2]; +- float shift_min = std::min(oz_median * 0.001f, 1.0f); +- float dist_threshold = shift_min * 0.1f; +- __depth_scaling = (1.0f / oz_median) / __data_normalize_median; +- if (__verbose_level > 2) +- std::cout << "Depth normalized by " << __depth_scaling << " (" +- << oz_median << ")\n"; +- +- for (int i = 0; i < _num_camera; ++i) { +- // move the camera a little bit? +- if (!__depth_degeneracy_fix) { +- } else if ((cpdist1[i] < dist_threshold || +- cpdist2[i] > -dist_threshold)) { +- float shift = shift_min; //(cpdist1[i] <= -cpdist2[i] ? 
shift_min : +- //-shift_min); +- // if(cpdist1[i] < dist_bound && cpdist2[i] > - dist_bound) shift = - +- // 0.5f * (cpdist1[i] + cpdist2[i]); +- bool boths = +- cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold; +- _camera_data[i].t[2] += shift; +- if (__verbose_level > 3) +- std::cout << "Adjust C" << std::setw(5) << i << " by " +- << std::setw(12) << shift << " [B" << std::setw(2) +- << cambpj[i] << "/" << std::setw(5) << camnpj[i] << "] [" +- << (boths ? 'X' : ' ') << "][" << cpdist1[i] << ", " +- << cpdist2[i] << "]\n"; +- __num_camera_modified++; +- } +- _camera_data[i].t[0] *= __depth_scaling; +- _camera_data[i].t[1] *= __depth_scaling; +- _camera_data[i].t[2] *= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- ///////////////////////////////// +- _point_data[4 * i + 0] *= __depth_scaling; +- _point_data[4 * i + 1] *= __depth_scaling; +- _point_data[4 * i + 2] *= __depth_scaling; +- } +- } +- if (__num_point_behind > 0) +- std::cout << "WARNING: " << __num_point_behind +- << " points are behind cameras.\n"; +- if (__num_camera_modified > 0) +- std::cout << "WARNING: " << __num_camera_modified +- << " camera moved to avoid degeneracy.\n"; +- } +-} +- +-void SparseBundleCU::NormalizeData() { +- TimerBA timer(this, TIMER_PREPROCESSING); +- NormalizeDataD(); +- NormalizeDataF(); +-} +- +-void SparseBundleCU::DenormalizeData() { +- if (__focal_normalize && __focal_scaling != 1.0f) { +- float squared_focal_factor = (__focal_scaling * __focal_scaling); +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f /= __focal_scaling; +- if (__use_radial_distortion == -1) +- _camera_data[i].radial *= squared_focal_factor; +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- _projection_sse /= squared_focal_factor; +- __focal_scaling = 1.0f; +- _imgpt_datax.resize(0); +- } else if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- +- if (__depth_normalize && __depth_scaling != 1.0f) { +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].t[0] /= __depth_scaling; +- _camera_data[i].t[1] /= __depth_scaling; +- _camera_data[i].t[2] /= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- _point_data[4 * i + 0] /= __depth_scaling; +- _point_data[4 * i + 1] /= __depth_scaling; +- _point_data[4 * i + 2] /= __depth_scaling; +- } +- __depth_scaling = 1.0f; +- } +-} +- +-void SparseBundleCU::TransferDataToHost() { +- TimerBA timer(this, TIMER_GPU_DOWNLOAD); +- _cuCameraData.CopyToHost(_camera_data); +- _cuPointData.CopyToHost(_point_data); +-} +- +-float SparseBundleCU::EvaluateProjection(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjection(cam, point, _cuMeasurements, _cuProjectionMap, proj, +- __use_radial_distortion); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt); +- return (float)ComputeVectorNorm(proj, _cuBufferData); +-} +- +-float SparseBundleCU::EvaluateProjectionX(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjectionX(cam, point, _cuMeasurements, _cuProjectionMap, proj, +- __use_radial_distortion); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt); +- return (float)ComputeVectorNorm(proj, _cuBufferData); +-} +- 
+-void SparseBundleCU::DebugProjections() { +- double e1 = 0, e2 = 0; +- for (int i = 0; i < _num_imgpt; ++i) { +- float* c = (float*)(_camera_data + _camera_idx[i]); +- float* p = _point_data + 4 * _point_idx[i]; +- const float* m = _imgpt_datax.size() > 0 ? (&_imgpt_datax[i * 2]) +- : (_imgpt_data + 2 * i); +- float* r = c + 4; +- float* t = c + 1; +- float dx1, dy1; +- //////////////////////////////////////////////////////////////////////////////// +- float z = r[6] * p[0] + r[7] * p[1] + r[8] * p[2] + t[2]; +- float xx = (r[0] * p[0] + r[1] * p[1] + r[2] * p[2] + t[0]); +- float yy = (r[3] * p[0] + r[4] * p[1] + r[5] * p[2] + t[1]); +- float x = xx / z; +- float y = yy / z; +- if (__use_radial_distortion == -1) { +- float rn = (m[0] * m[0] + m[1] * m[1]) * c[13] + 1.0f; +- dx1 = c[0] * x - m[0] * rn; +- dy1 = c[0] * y - m[1] * rn; +- e1 += (dx1 * dx1 + dy1 * dy1); +- e2 += (dx1 * dx1 + dy1 * dy1) / (rn * rn); +- } else if (__use_radial_distortion) { +- float rn = (x * x + y * y) * c[13] + 1.0f; +- dx1 = c[0] * x * rn - m[0]; +- dy1 = c[0] * y * rn - m[1]; +- e1 += (dx1 * dx1 + dy1 * dy1) / (rn * rn); +- e2 += (dx1 * dx1 + dy1 * dy1); +- } else { +- dx1 = c[0] * x - m[0]; +- dy1 = c[0] * y - m[1]; +- e1 += (dx1 * dx1 + dy1 * dy1); +- e2 += (dx1 * dx1 + dy1 * dy1); +- } +- if (!isfinite(dx1) || !isfinite(dy1)) +- std::cout << "x = " << xx << " y = " << yy << " z = " << z << '\n' +- << "t0 = " << t[0] << " t1 = " << t[1] << " t2 = " << t[2] +- << '\n' << "p0 = " << p[0] << " p1 = " << p[1] +- << " p2 = " << p[2] << '\n'; +- } +- e1 = e1 / (__focal_scaling * __focal_scaling) / _num_imgpt; +- e2 = e2 / (__focal_scaling * __focal_scaling) / _num_imgpt; +- std::cout << "DEBUG: mean squared error = " << e1 +- << " in undistorted domain;\n"; +- std::cout << "DEBUG: mean squared error = " << e2 +- << " in distorted domain.\n"; +-} +- +-bool SparseBundleCU::InitializeStorageForCG() { +- bool success = true; +- size_t total_sz = 0; +- int plen = GetParameterLength(); // q = 8m + 4n +- +- //////////////////////////////////////////// 6q +- ALLOCATE_REQUIRED_DATA(_cuVectorJtE, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorXK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorPK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorRK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorJJ, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorZK, plen, 1); +- +- ///////////////////////////////// +- unsigned int cblock_len = (__use_radial_distortion ? 
64 : 56); +- ALLOCATE_REQUIRED_DATA(_cuBlockPC, _num_camera * cblock_len + 12 * _num_point, +- 1); // 64m + 12n +- if (__accurate_gain_ratio) { +- ALLOCATE_REQUIRED_DATA(_cuVectorJX, _num_imgpt + _num_imgpt_q, 2); // 2k +- } else { +- _cuVectorJX.SetTexture(_cuImageProj.data(), _num_imgpt + _num_imgpt_q, 2); +- } +- ALLOCATE_OPTIONAL_DATA(_cuVectorSJ, plen, 1, __jacobian_normalize); +- +- ///////////////////////////////////////// +- __memory_usage += total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Conjugate Gradient Solver:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- return success; +-} +- +-void SparseBundleCU::PrepareJacobianNormalization() { +- if (!_cuVectorSJ.IsValid()) return; +- +- if ((__jc_store_transpose || __jc_store_original) && +- _cuJacobianPoint.IsValid() && !__bundle_current_mode) { +- CuTexImage null; +- null.SwapData(_cuVectorSJ); +- EvaluateJacobians(); +- null.SwapData(_cuVectorSJ); +- ComputeDiagonal(_cuVectorJJ, _cuVectorSJ); +- ComputeSQRT(_cuVectorSJ); +- } else { +- CuTexImage null; +- null.SwapData(_cuVectorSJ); +- EvaluateJacobians(); +- ComputeBlockPC(0, true); +- null.SwapData(_cuVectorSJ); +- _cuVectorJJ.SwapData(_cuVectorSJ); +- ProgramCU::ComputeRSQRT(_cuVectorSJ); +- } +-} +- +-void SparseBundleCU::EvaluateJacobians(bool shuffle) { +- if (__no_jacobian_store) return; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original && +- !__jc_store_transpose) +- return; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JJ, true); +- +- if (__jc_store_original || !__jc_store_transpose) { +- ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCamera, +- _cuJacobianPoint, _cuProjectionMap, _cuVectorSJ, +- _cuMeasurements, _cuCameraMeasurementList, +- __fixed_intrinsics, __use_radial_distortion, false); +- if (shuffle && __jc_store_transpose && _cuJacobianCameraT.IsValid()) +- ShuffleCameraJacobian(_cuJacobianCamera, _cuCameraMeasurementList, +- _cuJacobianCameraT); +- } else { +- ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCameraT, +- _cuJacobianPoint, _cuProjectionMap, _cuVectorSJ, +- _cuMeasurements, _cuCameraMeasurementListT, +- __fixed_intrinsics, __use_radial_distortion, true); +- } +- ++__num_jacobian_eval; +-} +- +-void SparseBundleCU::ComputeJtE(CuTexImage& E, CuTexImage& JtE, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JTE, true); +- if (mode == 0) mode = __bundle_current_mode; +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) { +- ProgramCU::ComputeJtE_(E, JtE, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuCameraMeasurementMap, _cuCameraMeasurementList, +- _cuPointMeasurementMap, _cuProjectionMap, +- _cuJacobianPoint, __fixed_intrinsics, +- __use_radial_distortion, mode); +- +- //////////////////////////////////////////////////////////////////////////////////// +- if (!_cuVectorSJ.IsValid()) { +- } else if (mode == 2) { +- if (!_cuJacobianPoint.IsValid()) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_point * 4, _num_camera * 8); +- } else if (mode == 1) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8); +- else +- ComputeVXY(JtE, _cuVectorSJ, JtE, +- _cuJacobianPoint.IsValid() ? 
_num_camera * 8 : 0); +- +- } else if (__jc_store_transpose) { +- ProgramCU::ComputeJtE(E, _cuJacobianCameraT, _cuCameraMeasurementMap, +- _cuCameraMeasurementList, _cuJacobianPoint, +- _cuPointMeasurementMap, JtE, true, mode); +- } else { +- ProgramCU::ComputeJtE(E, _cuJacobianCamera, _cuCameraMeasurementMap, +- _cuCameraMeasurementList, _cuJacobianPoint, +- _cuPointMeasurementMap, JtE, false, mode); +- } +- +- if (mode != 2 && _num_imgpt_q > 0) +- ProgramCU::ComputeJQtEC(E, _cuCameraQList, _cuCameraQListW, _cuVectorSJ, +- JtE); +-} +- +-void SparseBundleCU::SaveBundleRecord(int iter, float res, float damping, +- float& g_norm, float& g_inf) { +- // do not really compute if parameter not specified... +- // for large dataset, it never converges.. +- g_inf = +- __lm_check_gradient ? ComputeVectorMax(_cuVectorJtE, _cuBufferData) : 0; +- g_norm = __save_gradient_norm +- ? float(ComputeVectorNorm(_cuVectorJtE, _cuBufferData)) +- : g_inf; +- ConfigBA::SaveBundleRecord(iter, res, damping, g_norm, g_inf); +-} +- +-void SparseBundleCU::ComputeJX(CuTexImage& X, CuTexImage& JX, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JX, true); +- if (__no_jacobian_store || (__multiply_jx_usenoj && mode != 2) || +- !__jc_store_original) { +- if (_cuVectorSJ.IsValid()) { +- if (mode == 0) +- ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK); +- else if (mode == 1) +- ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_camera * 8); +- else if (mode == 2) +- ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_point * 4, +- _num_camera * 8); +- ProgramCU::ComputeJX_(_cuVectorZK, JX, _cuCameraData, _cuPointData, +- _cuMeasurements, _cuProjectionMap, +- __fixed_intrinsics, __use_radial_distortion, mode); +- } else { +- ProgramCU::ComputeJX_(X, JX, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuProjectionMap, __fixed_intrinsics, +- __use_radial_distortion, mode); +- } +- } else { +- ProgramCU::ComputeJX(_num_camera * 2, X, _cuJacobianCamera, +- _cuJacobianPoint, _cuProjectionMap, JX, mode); +- } +- +- if (_num_imgpt_q > 0 && mode != 2) { +- ProgramCU::ComputeJQX(X, _cuCameraQMap, _cuCameraQMapW, _cuVectorSJ, JX, +- _num_imgpt); +- } +-} +- +-void SparseBundleCU::ComputeBlockPC(float lambda, bool dampd) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_BC, true); +- +- bool use_diagonal_q = _cuCameraQListW.IsValid() && __bundle_current_mode != 2; +- if (use_diagonal_q) +- ComputeDiagonalQ(_cuCameraQListW, _cuVectorSJ, _cuVectorJJ); +- +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) { +- ComputeDiagonalBlock_( +- lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuCameraMeasurementMap, _cuCameraMeasurementList, +- _cuPointMeasurementMap, _cuProjectionMap, _cuJacobianPoint, _cuVectorSJ, +- _cuVectorJJ, _cuBlockPC, __fixed_intrinsics, __use_radial_distortion, +- use_diagonal_q, __bundle_current_mode); +- } else if (__jc_store_transpose) { +- ComputeDiagonalBlock(lambda, dampd, _cuJacobianCameraT, +- _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuVectorJJ, _cuBlockPC, __use_radial_distortion, true, +- use_diagonal_q, __bundle_current_mode); +- } else { +- ComputeDiagonalBlock(lambda, dampd, _cuJacobianCamera, +- _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuVectorJJ, _cuBlockPC, __use_radial_distortion, +- false, use_diagonal_q, __bundle_current_mode); +- } +-} +- +-void SparseBundleCU::ApplyBlockPC(CuTexImage& v, CuTexImage& pv, int mode) 
{ +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_MP, true); +- MultiplyBlockConditioner(_num_camera, _num_point, _cuBlockPC, v, pv, +- __use_radial_distortion, mode); +-} +- +-void SparseBundleCU::ComputeDiagonal(CuTexImage& JJ, CuTexImage& JJI) { +- ////////////////////checking the impossible. +- if (__no_jacobian_store) return; +- if (!__jc_store_transpose && !__jc_store_original) return; +- +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_DD, true); +- bool use_diagonal_q = _cuCameraQListW.IsValid(); +- if (use_diagonal_q) { +- CuTexImage null; +- ComputeDiagonalQ(_cuCameraQListW, null, JJ); +- } +- if (__jc_store_transpose) { +- ProgramCU::ComputeDiagonal(_cuJacobianCameraT, _cuCameraMeasurementMap, +- _cuJacobianPoint, _cuPointMeasurementMap, +- _cuCameraMeasurementList, JJ, JJI, true, +- __use_radial_distortion, use_diagonal_q); +- } else { +- ProgramCU::ComputeDiagonal(_cuJacobianCamera, _cuCameraMeasurementMap, +- _cuJacobianPoint, _cuPointMeasurementMap, +- _cuCameraMeasurementList, JJ, JJI, false, +- __use_radial_distortion, use_diagonal_q); +- } +-} +- +-int SparseBundleCU::SolveNormalEquationPCGX(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- CuTexImage null; +- CuTexImage& VectorDP = +- __lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- /////////////////////////////////////////////////////// +- // B = [BC 0 ; 0 BP] +- // m = [mc 0; 0 mp]; +- // A x= BC * x - JcT * Jp * mp * JpT * Jc * x +- // = JcT * Jc x + lambda * D * x + ........ 
+- //////////////////////////////////////////////////////////// +- +- CuTexImage r; +- r.SetTexture(_cuVectorRK.data(), 8 * _num_camera); +- CuTexImage p; +- p.SetTexture(_cuVectorPK.data(), 8 * _num_camera); +- CuTexImage z; +- z.SetTexture(_cuVectorZK.data(), 8 * _num_camera); +- CuTexImage x; +- x.SetTexture(_cuVectorXK.data(), 8 * _num_camera); +- CuTexImage d; +- d.SetTexture(VectorDP.data(), 8 * _num_camera); +- +- CuTexImage& u = _cuVectorRK; +- CuTexImage& v = _cuVectorPK; +- CuTexImage up; +- up.SetTexture(u.data() + 8 * _num_camera, 4 * _num_point); +- CuTexImage vp; +- vp.SetTexture(v.data() + 8 * _num_camera, 4 * _num_point); +- CuTexImage uc; +- uc.SetTexture(z.data(), 8 * _num_camera); +- +- CuTexImage& e = _cuVectorJX; +- CuTexImage& e2 = _cuImageProj; +- +- ApplyBlockPC(_cuVectorJtE, u, 2); +- ComputeJX(u, e, 2); +- ComputeJtE(e, uc, 1); +- ComputeSAXPY(-1.0f, uc, _cuVectorJtE, r); // r +- ApplyBlockPC(r, p, 1); // z = p = M r +- +- float_t rtz0 = (float_t)ComputeVectorDot(r, p, _cuBufferData); // r(0)' * +- // z(0) +- ComputeJX(p, e, 1); // Jc * x +- ComputeJtE(e, u, 2); // JpT * jc * x +- ApplyBlockPC(u, v, 2); +- float_t qtq0 = (float_t)ComputeVectorNorm(e, _cuBufferData); // q(0)' * q(0) +- float_t pdp0 = +- (float_t)ComputeVectorNormW(p, d, _cuBufferData); // p(0)' * DDD * p(0) +- float_t uv0 = (float_t)ComputeVectorDot(up, vp, _cuBufferData); +- float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- ComputeSAX((float)alpha0, p, x); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY(-1.0f, e2, e, e); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ(lambda, d, p, uc, uc); +- ComputeSAXPY((float)-alpha0, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- +- ////////////////////////////////////////////////////////////////////////// +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(r, z, 1); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- r, z, _cuBufferData); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? 
'0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- //////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((float)betak, p, z, p); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(p, e, 1); // Jc * p +- ComputeJtE(e, u, 2); // JpT * jc * p +- ApplyBlockPC(u, v, 2); +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = (float_t)ComputeVectorNorm(e, _cuBufferData); // q(k)' q(k) +- float_t pdpk = +- (float_t)ComputeVectorNormW(p, d, _cuBufferData); // p(k)' * DDD * p(k) +- float_t uvk = (float_t)ComputeVectorDot(up, vp, _cuBufferData); +- float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. +- +- //////////////////////////////////////////////// +- ComputeSAXPY((float)alphak, p, x, x); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY(-1.0f, e2, e, e); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ(lambda, d, p, uc, uc); +- ComputeSAXPY((float)-alphak, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- } +- +- // if(__recent_cg_status == 'X') return iteration; +- +- ComputeJX(x, e, 1); +- ComputeJtE(e, u, 2); +- CuTexImage jte_p; +- jte_p.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 4); +- ComputeSAXPY(-1.0f, up, jte_p, vp); +- ApplyBlockPC(v, _cuVectorXK, 2); +- return iteration; +-} +-int SparseBundleCU::SolveNormalEquationPCGB(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- CuTexImage null; +- CuTexImage& VectorDP = +- __lm_use_diagonal_damp ? 
_cuVectorJJ : null; // diagonal +- CuTexImage& VectorQK = _cuVectorZK; // temporary +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- //////////////////////////////////////////////////////// +- ApplyBlockPC(_cuVectorJtE, +- _cuVectorPK); // z(0) = p(0) = M * r(0)//r(0) = Jt * e +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(0) = J * p(0) +- +- ////////////////////////////////////////////////// +- float_t rtz0 = (float_t)ComputeVectorDot(_cuVectorJtE, _cuVectorPK, +- _cuBufferData); // r(0)' * z(0) +- float_t qtq0 = +- (float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData); // q(0)' * q(0) +- float_t ptdp0 = (float_t)ComputeVectorNormW( +- _cuVectorPK, VectorDP, _cuBufferData); // p(0)' * DDD * p(0) +- float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- ComputeSAX((float)alpha0, _cuVectorPK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJtE(_cuVectorJX, VectorQK); // Jt * (J * p0) +- +- ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK, +- VectorQK); // Jt * J * p0 + lambda * DDD * p0 +- ComputeSAXPY( +- (float)-alpha0, VectorQK, _cuVectorJtE, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(_cuVectorRK, _cuVectorZK); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- _cuVectorRK, _cuVectorZK, +- _cuBufferData); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? '0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- +- //////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((float)betak, _cuVectorPK, _cuVectorZK, +- _cuVectorPK); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(k) = J * p(k) +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = +- (float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData); // q(k)' q(k) +- float_t ptdpk = (float_t)ComputeVectorNormW( +- _cuVectorPK, VectorDP, _cuBufferData); // p(k)' * DDD * p(k) +- float_t alphak = rtzk / (qtqk + lambda * ptdpk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. 
+- +- //////////////////////////////////////////////// +- ComputeSAXPY((float)alphak, _cuVectorPK, _cuVectorXK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- // if(iteration == 2 && rtz_ratio < __cg_norm_threshold) +- if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) { +- ////r = JtE - (Jt J + lambda * D) x +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ(lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK); +- ComputeSAXPY(-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK); +- } else { +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK, VectorQK); // +- ComputeSAXPY( +- (float)-alphak, VectorQK, _cuVectorRK, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- } +- } +- return iteration; +-} +- +-int SparseBundleCU::SolveNormalEquation(float lambda) { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1); +- return 1; +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2); +- return 1; +- } else { +- ////solve linear system using Conjugate Gradients +- return __cg_schur_complement ? SolveNormalEquationPCGX(lambda) +- : SolveNormalEquationPCGB(lambda); +- } +-} +- +-void SparseBundleCU::RunTestIterationLM(bool reduced) { +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- if (reduced) +- SolveNormalEquationPCGX(__lm_initial_damp); +- else +- SolveNormalEquationPCGB(__lm_initial_damp); +- UpdateCameraPoint(_cuVectorZK, _cuImageProj); +- ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData); +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeVectorNorm(_cuVectorJX, _cuBufferData); +-} +- +-float SparseBundleCU::UpdateCameraPoint(CuTexImage& dx, +- CuTexImage& cuImageTempProj) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_UP, true); +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera); +- ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx, +- _cuCameraDataEX, _cuPointDataEX, +- __bundle_current_mode); +- return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 4 * _num_point, 8 * _num_camera); +- ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx, +- _cuCameraDataEX, _cuPointDataEX, +- __bundle_current_mode); +- return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj); +- } else { +- if (__jacobian_normalize) ComputeVXY(_cuVectorXK, _cuVectorSJ, dx); +- ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx, +- _cuCameraDataEX, _cuPointDataEX, +- __bundle_current_mode); +- return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj); +- } +-} +- +-float SparseBundleCU::SaveUpdatedSystem(float residual_reduction, +- float dx_sqnorm, float damping) { +- float expected_reduction; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- CuTexImage xk; +- 
xk.SetTexture(_cuVectorXK.data(), 8 * _num_camera); +- CuTexImage jte; +- jte.SetTexture(_cuVectorJtE.data(), 8 * _num_camera); +- float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData); +- if (__lm_use_diagonal_damp) { +- CuTexImage jj; +- jj.SetTexture(_cuVectorJJ.data(), 8 * _num_camera); +- float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuCameraData.SwapData(_cuCameraDataEX); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- CuTexImage xk; +- xk.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point); +- CuTexImage jte; +- jte.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, 4 * _num_point); +- float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData); +- if (__lm_use_diagonal_damp) { +- CuTexImage jj; +- jj.SetTexture(_cuVectorJJ.data() + 8 * _num_camera, 4 * _num_point); +- float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuPointData.SwapData(_cuPointDataEX); +- } else { +- float dxtg = +- (float)ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData); +- +- if (__accurate_gain_ratio) { +- ComputeJX(_cuVectorXK, _cuVectorJX); +- float njx = (float)ComputeVectorNorm(_cuVectorJX, _cuBufferData); +- expected_reduction = 2.0f * dxtg - njx; +- // could the expected reduction be negative??? not sure +- if (expected_reduction <= 0) +- expected_reduction = 0.001f * residual_reduction; +- } else if (__lm_use_diagonal_damp) { +- float dq = +- (float)ComputeVectorNormW(_cuVectorXK, _cuVectorJJ, _cuBufferData); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- +- /// save the new motion/struture +- _cuCameraData.SwapData(_cuCameraDataEX); +- _cuPointData.SwapData(_cuPointDataEX); +- +- //_cuCameraData.CopyToHost(_camera_data); +- //_cuPointData.CopyToHost(_point_data); +- // DebugProjections(); +- } +- //////////////////////////////////////////// +- return float(residual_reduction / expected_reduction); +-} +- +-void SparseBundleCU::AdjustBundleAdjsutmentMode() { +- if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- _cuJacobianCamera.InitTexture(0, 0); +- _cuJacobianCameraT.InitTexture(0, 0); +- } +-} +- +-float SparseBundleCU::EvaluateDeltaNorm() { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- CuTexImage temp; +- temp.SetTexture(_cuVectorXK.data(), 8 * _num_camera); +- return ComputeVectorNorm(temp, _cuBufferData); +- +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- CuTexImage temp; +- temp.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point); +- return ComputeVectorNorm(temp, _cuBufferData); +- } else { +- return (float)ComputeVectorNorm(_cuVectorXK, _cuBufferData); +- } +-} +- +-void SparseBundleCU::NonlinearOptimizeLM() { +- //////////////////////////////////////// +- TimerBA timer(this, TIMER_OPTIMIZATION); +- +- //////////////////////////////////////////////// +- float mse_convert_ratio = +- 1.0f / (_num_imgpt * __focal_scaling * __focal_scaling); +- float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f; +- const int edwidth = __verbose_sse ? 
12 : 8; +- _projection_sse = +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- __initial_mse = __final_mse = _projection_sse * mse_convert_ratio; +- +- // compute jacobian diagonals for normalization +- if (__jacobian_normalize) PrepareJacobianNormalization(); +- +- // evalaute jacobian +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- /////////////////////////////////////////////////////////////// +- if (__verbose_level) +- std::cout << "Initial " << (__verbose_sse ? "sumed" : "mean") +- << " squared error = " << __initial_mse * error_display_ratio +- << "\n----------------------------------------------\n"; +- +- ////////////////////////////////////////////////// +- CuTexImage& cuImageTempProj = _cuVectorJX; +- // CuTexImage& cuVectorTempJX = _cuVectorJX; +- CuTexImage& cuVectorDX = _cuVectorSJ.IsValid() ? _cuVectorZK : _cuVectorXK; +- +- ////////////////////////////////////////////////// +- float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf; +- SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm, +- g_inf); +- +- //////////////////////////////////// +- std::cout << std::left; +- for (int i = 0; i < __lm_max_iteration && !__abort_flag; +- __current_iteration = (++i)) { +- ////solve linear system +- int num_cg_iteration = SolveNormalEquation(damping); +- +- // there must be NaN somewhere +- if (num_cg_iteration == 0) { +- if (__verbose_level) +- std::cout << "#" << std::setw(3) << i << " quit on numeric errors\n"; +- __pba_return_code = 'E'; +- break; +- } +- +- // there must be infinity somewhere +- if (__recent_cg_status == 'I') { +- std::cout << "#" << std::setw(3) << i << " 0 I e=" << std::setw(edwidth) +- << "------- " +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << '\n' << std::setprecision(6); +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- --i; +- continue; +- } +- +- ///////////////////// +- ++__num_lm_iteration; +- +- //////////////////////////////////// +- float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm); +- +- // In this library, we check absolute difference instead of realtive +- // difference +- if (dx_norm <= __lm_delta_threshold) { +- // damping factor must be way too big...or it converges +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " quit on too small change (" << dx_norm << " < " +- << __lm_delta_threshold << ")\n"; +- __pba_return_code = 'S'; +- break; +- } +- /////////////////////////////////////////////////////////////////////// +- // update structure and motion, check reprojection error +- float new_residual = UpdateCameraPoint(cuVectorDX, cuImageTempProj); +- float average_residual = new_residual * mse_convert_ratio; +- float residual_reduction = _projection_sse - new_residual; +- +- // do we find a better solution? 
+- if (isfinite(new_residual) && residual_reduction > 0) { +- ////compute relative norm change +- float relative_reduction = 1.0f - (new_residual / _projection_sse); +- +- //////////////////////////////////// +- __num_lm_success++; // increase counter +- _projection_sse = new_residual; // save the new residual +- _cuImageProj.SwapData(cuImageTempProj); // save the new projection +- +- ///////////////gain ratio//////////////////// +- float gain_ratio = +- SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping); +- +- ///////////////////////////////////// +- SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping, +- g_norm, g_inf); +- +- ///////////////////////////////////////////// +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=" << std::setw(6) +- << floor(gain_ratio * 1000.f) * 0.001f +- << " g=" << std::setw(g_norm > 0 ? 9 : 1) << g_norm << " " +- << std::setw(9) << relative_reduction << ' ' << std::setw(9) +- << dx_norm << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- ///////////////////////////// +- if (!IsTimeBudgetAvailable()) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " used up time budget.\n"; +- __pba_return_code = 'T'; +- break; +- } else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i +- << " converged with small gradient\n"; +- __pba_return_code = 'G'; +- break; +- } else if (average_residual * error_display_ratio <= __lm_mse_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " satisfies MSE threshold\n"; +- __pba_return_code = 'M'; +- break; +- } else { +- /////////////////////////////adjust damping factor +- float temp = gain_ratio * 2.0f - 1.0f; +- float adaptive_adjust = 1.0f - temp * temp * temp; // powf(, 3.0f); // +- float auto_adjust = std::max(1.0f / 3.0f, adaptive_adjust); +- +- ////////////////////////////////////////////////// +- damping = damping * auto_adjust; +- damping_adjust = 2.0f; +- if (damping < __lm_minimum_damp) +- damping = __lm_minimum_damp; +- else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp && +- __lm_use_diagonal_damp) +- damping = __lm_maximum_damp; +- +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- } +- } else { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) << std::left +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=----- " << (__lm_check_gradient || __save_gradient_norm +- ? 
" g=---------" +- : " g=0") +- << " --------- " << std::setw(9) << dx_norm +- << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp && +- damping > __lm_damping_auto_switch) { +- __lm_use_diagonal_damp = false; +- damping = __lm_damping_auto_switch; +- damping_adjust = 2.0f; +- if (__verbose_level > 1) +- std::cout << "NOTE: switch to damping with an identity matix\n"; +- } else { +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- } +- } +- +- if (__verbose_level == 1) std::cout << '.'; +- } +- +- __final_mse = float(_projection_sse * mse_convert_ratio); +- __final_mse_x = +- __use_radial_distortion +- ? EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) * +- mse_convert_ratio +- : __final_mse; +-} +- +-#define PROFILE_(A, B) \ +- BundleTimerStart(TIMER_PROFILE_STEP); \ +- for (int i = 0; i < repeat; ++i) { \ +- B; \ +- FinishWorkCUDA(); \ +- } \ +- BundleTimerSwitch(TIMER_PROFILE_STEP); \ +- std::cout << std::setw(24) << A << ": " \ +- << (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n"; +- +-#define PROFILE(A, B) PROFILE_(#A, A B) +-#define PROXILE(A, B) PROFILE_(A, B) +- +-void SparseBundleCU::RunProfileSteps() { +- const int repeat = __profile_pba; +- std::cout << "---------------------------------\n" +- "| Run profiling steps (" +- << repeat << ") |\n" +- "---------------------------------\n" +- << std::left; +- ; +- +- /////////////////////////////////////////////// +- PROXILE("Upload Measurements", +- _cuMeasurements.CopyFromHost( +- _imgpt_datax.size() > 0 ? &_imgpt_datax[0] : _imgpt_data)); +- PROXILE("Upload Point Data", _cuPointData.CopyToHost(_point_data)); +- std::cout << "---------------------------------\n"; +- +- ///////////////////////////////////////////// +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- PrepareJacobianNormalization(); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- ComputeBlockPC(__lm_initial_damp, true); +- FinishWorkCUDA(); +- +- do { +- if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 && +- SolveNormalEquationPCGB(__lm_initial_damp) == 10) +- break; +- __lm_initial_damp *= 2.0f; +- } while (__lm_initial_damp < 1024.0f); +- std::cout << "damping set to " << __lm_initial_damp << " for profiling\n" +- << "---------------------------------\n"; +- +- { +- int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration; +- __cg_max_iteration = __cg_min_iteration = 10; +- __num_cg_iteration = 0; +- PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- +- ///////////////////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- std::cout << "---------------------------------\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMX", RunTestIterationLM(true)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- //////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMB", RunTestIterationLM(false)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations 
in all\n"; +- std::cout << "---------------------------------\n"; +- __cg_max_iteration = cgmax; +- __cg_min_iteration = cgmin; +- } +- ///////////////////////////////////////////////////// +- PROFILE(UpdateCameraPoint, (_cuVectorZK, _cuImageProj)); +- PROFILE(ComputeVectorNorm, (_cuVectorXK, _cuBufferData)); +- PROFILE(ComputeVectorDot, (_cuVectorXK, _cuVectorRK, _cuBufferData)); +- PROFILE(ComputeVectorNormW, (_cuVectorXK, _cuVectorRK, _cuBufferData)); +- PROFILE(ComputeSAXPY, (0.01f, _cuVectorXK, _cuVectorRK, _cuVectorZK)); +- PROFILE(ComputeSXYPZ, +- (0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK)); +- std::cout << "---------------------------------\n"; +- PROFILE(ComputeVectorNorm, (_cuImageProj, _cuBufferData)); +- PROFILE(ComputeSAXPY, (0.000f, _cuImageProj, _cuVectorJX, _cuVectorJX)); +- std::cout << "---------------------------------\n"; +- +- __multiply_jx_usenoj = false; +- /////////////////////////////////////////////////////// +- PROFILE(EvaluateProjection, (_cuCameraData, _cuPointData, _cuImageProj)); +- PROFILE(ApplyBlockPC, (_cuVectorJtE, _cuVectorPK)); +- ///////////////////////////////////////////////// +- if (!__no_jacobian_store) { +- if (__jc_store_original) { +- PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- PROFILE(EvaluateJacobians, (false)); +- +- if (__jc_store_transpose) { +- PROFILE( +- ShuffleCameraJacobian, +- (_cuJacobianCamera, _cuCameraMeasurementList, _cuJacobianCameraT)); +- PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- __jc_store_original = false; +- PROFILE(EvaluateJacobians, ()); +- __jc_store_original = true; +- } +- ////////////////////////////////////////////////// +- +- std::cout << "---------------------------------\n" +- "| Not storing transpose JC | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- +- ////////////////////////////////////// +- +- } else if (__jc_store_transpose) { +- PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- PROFILE(EvaluateJacobians, ()); +- } +- } +- +- if (!__no_jacobian_store) { +- std::cout << "---------------------------------\n" +- "| Not storing Camera Jacobians | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- __jc_store_original = false; +- _cuJacobianCamera.ReleaseData(); +- _cuJacobianCameraT.ReleaseData(); +- PROFILE(EvaluateJacobians, ()); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- } +- +- /////////////////////////////////////////////// +- +- std::cout << "---------------------------------\n" +- "| Not storing any jacobians |\n" +- "---------------------------------\n"; +- __no_jacobian_store = true; +- _cuJacobianPoint.ReleaseData(); +- PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- +- std::cout << "---------------------------------\n"; +-} +- 
+-void SparseBundleCU::RunDebugSteps() { +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- // DEBUG_FUNCN(_cuVectorXK, SolveNormalEquationPCGB, (0.001f), 100); +- DEBUG_FUNCN(_cuVectorJtE, ComputeJtE, (_cuImageProj, _cuVectorJtE), 100); +- DEBUG_FUNCN(_cuVectorJX, ComputeJX, (_cuVectorJtE, _cuVectorJX), 100); +-} +- +-void SparseBundleCU::SaveNormalEquation(float lambda) { +- ofstream out1("../../matlab/cg_j.txt"); +- ofstream out2("../../matlab/cg_b.txt"); +- ofstream out3("../../matlab/cg_x.txt"); +- +- out1 << std::setprecision(20); +- out2 << std::setprecision(20); +- out3 << std::setprecision(20); +- +- int plen = GetParameterLength(); +- vector jc(16 * _num_imgpt); +- vector jp(8 * _num_imgpt); +- vector ee(2 * _num_imgpt); +- vector dx(plen); +- +- _cuJacobianCamera.CopyToHost(&jc[0]); +- _cuJacobianPoint.CopyToHost(&jp[0]); +- _cuImageProj.CopyToHost(&ee[0]); +- _cuVectorXK.CopyToHost(&dx[0]); +- +- for (int i = 0; i < _num_imgpt; ++i) { +- out2 << ee[i * 2] << ' ' << ee[i * 2 + 1] << ' '; +- int cidx = _camera_idx[i], pidx = _point_idx[i]; +- float *cp = &jc[i * 16], *pp = &jp[i * 8]; +- int cmin = cidx * 8, pmin = 8 * _num_camera + pidx * 4; +- for (int j = 0; j < 8; ++j) +- out1 << (i * 2 + 1) << ' ' << (cmin + j + 1) << ' ' << cp[j] << '\n'; +- for (int j = 0; j < 8; ++j) +- out1 << (i * 2 + 2) << ' ' << (cmin + j + 1) << ' ' << cp[j + 8] << '\n'; +- for (int j = 0; j < 4; ++j) +- out1 << (i * 2 + 1) << ' ' << (pmin + j + 1) << ' ' << pp[j] << '\n'; +- for (int j = 0; j < 4; ++j) +- out1 << (i * 2 + 2) << ' ' << (pmin + j + 1) << ' ' << pp[j + 4] << '\n'; +- } +- +- for (size_t i = 0; i < dx.size(); ++i) out3 << dx[i] << ' '; +- +- std::cout << "lambda = " << std::setprecision(20) << lambda << '\n'; +-} +- +-} // namespace pba +diff --git a/lib/PBA/SparseBundleCU.h b/lib/PBA/SparseBundleCU.h +deleted file mode 100644 +index 7183deb67..000000000 +--- a/lib/PBA/SparseBundleCU.h ++++ /dev/null +@@ -1,176 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCU.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : interface of the CUDA-version of multicore bundle +-// adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#if !defined(SPARSE_BUNDLE_CU_H) +-#define SPARSE_BUNDLE_CU_H +- +-#include "ConfigBA.h" +-#include "CuTexImage.h" +-#include "DataInterface.h" +- +-namespace pba { +- +-class SparseBundleCU : public ParallelBA, public ConfigBA { +- protected: // cpu data +- int _num_camera; +- int _num_point; +- int _num_imgpt; +- CameraT* _camera_data; +- float* _point_data; +- //////////////////////////////// +- const float* _imgpt_data; +- const int* _camera_idx; +- const int* _point_idx; +- const int* _focal_mask; +- std::vector _imgpt_datax; +- //////////////////////// +- float _projection_sse; // sumed square error +- protected: // cuda data +- CuTexImage _cuCameraData; +- CuTexImage _cuCameraDataEX; +- CuTexImage _cuPointData; +- CuTexImage _cuPointDataEX; +- CuTexImage _cuMeasurements; +- CuTexImage _cuImageProj; +- CuTexImage _cuJacobianCamera; +- CuTexImage _cuJacobianPoint; +- CuTexImage _cuJacobianCameraT; +- CuTexImage _cuProjectionMap; +- CuTexImage _cuPointMeasurementMap; +- CuTexImage _cuCameraMeasurementMap; +- CuTexImage _cuCameraMeasurementList; +- CuTexImage _cuCameraMeasurementListT; +- +- /////////////////////////////// +- CuTexImage _cuBufferData; +- //////////////////////////// +- CuTexImage _cuBlockPC; +- CuTexImage _cuVectorSJ; +- +- /// LM normal equation +- CuTexImage _cuVectorJtE; +- CuTexImage _cuVectorJJ; +- CuTexImage _cuVectorJX; +- CuTexImage _cuVectorXK; +- CuTexImage _cuVectorPK; +- CuTexImage _cuVectorZK; +- CuTexImage _cuVectorRK; +- +- /////////////////////// +- protected: +- int _num_imgpt_q; +- float _weight_q; +- CuTexImage _cuCameraQList; +- CuTexImage _cuCameraQMap; +- CuTexImage _cuCameraQMapW; +- CuTexImage _cuCameraQListW; +- +- protected: +- bool ProcessIndexCameraQ(std::vector& qmap, std::vector& qlist); +- void ProcessWeightCameraQ(std::vector& cpnum, std::vector& qmap, +- std::vector& qmapw, +- std::vector& qlistw); +- +- protected: // internal functions +- int GetParameterLength(); +- int InitializeBundle(); +- int ValidateInputData(); +- void ReleaseAllocatedData(); +- bool InitializeStorageForCG(); +- bool InitializeBundleGPU(); +- bool TransferDataToGPU(); +- void TransferDataToHost(); +- void DenormalizeData(); +- void NormalizeData(); +- void NormalizeDataF(); +- void NormalizeDataD(); +- void DebugProjections(); +- void RunDebugSteps(); +- bool CheckRequiredMem(int fresh = 1); +- bool CheckRequiredMemX(); +- void ReserveStorage(size_t ncam, size_t npt, size_t nproj); +- void ReserveStorageAuto(); +- +- protected: +- float EvaluateProjection(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj); +- float EvaluateProjectionX(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj); +- float UpdateCameraPoint(CuTexImage& dx, CuTexImage& cuImageTempProj); +- float SaveUpdatedSystem(float residual_reduction, float dx_sqnorm, +- float damping); +- float EvaluateDeltaNorm(); +- void EvaluateJacobians(bool shuffle = true); +- void PrepareJacobianNormalization(); +- void ComputeJtE(CuTexImage& E, CuTexImage& JtE, int mode = 0); +- void ComputeJX(CuTexImage& X, CuTexImage& JX, int mode = 0); +- void ComputeDiagonal(CuTexImage& JJ, CuTexImage& JJI); +- void ComputeBlockPC(float lambda, bool dampd = true); +- void ApplyBlockPC(CuTexImage& v, CuTexImage& pv, int mode = 0); +- int SolveNormalEquationPCGB(float lambda); +- int SolveNormalEquationPCGX(float lambda); +- int SolveNormalEquation(float lambda); +- void AdjustBundleAdjsutmentMode(); +- void 
NonlinearOptimizeLM(); +- void BundleAdjustment(); +- void RunTestIterationLM(bool reduced); +- void SaveBundleRecord(int iter, float res, float damping, float& g_norm, +- float& g_inf); +- ///////////////////////////////// +- void SaveNormalEquation(float lambda); +- void RunProfileSteps(); +- void WarmupDevice(); +- +- public: +- virtual float GetMeanSquaredError(); +- virtual void SetCameraData(size_t ncam, CameraT* cams); +- virtual void SetPointData(size_t npoint, Point3D* pts); +- virtual void SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx); +- virtual void SetFocalMask(const int* fmask, float weight); +- virtual int RunBundleAdjustment(); +- +- /// +- virtual void AbortBundleAdjustment() { __abort_flag = true; } +- virtual int GetCurrentIteration() { return __current_iteration; } +- virtual void SetNextTimeBudget(int seconds) { +- __bundle_time_budget = seconds; +- } +- virtual void SetNextBundleMode(BundleModeT mode) { +- __bundle_mode_next = mode; +- } +- virtual void SetFixedIntrinsics(bool fixed) { __fixed_intrinsics = fixed; } +- virtual void EnableRadialDistortion(DistortionT type) { +- __use_radial_distortion = type; +- } +- virtual void ParseParam(int narg, char** argv) { +- ConfigBA::ParseParam(narg, argv); +- } +- virtual ConfigBA* GetInternalConfig() { return this; } +- +- public: +- SparseBundleCU(int device); +- size_t GetMemCapacity(); +-}; +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/pba.cpp b/lib/PBA/pba.cpp +deleted file mode 100644 +index 77d62b070..000000000 +--- a/lib/PBA/pba.cpp ++++ /dev/null +@@ -1,134 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: pba.cpp +-// Author: Changchang Wu +-// Description : implementation of ParallelBA, which is a wrapper around +-// the GPU-based and CPU-based implementations +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +-#include +-#include +-#include "pba.h" +-#include "SparseBundleCU.h" +-#include "SparseBundleCPU.h" +- +-namespace pba { +- +-ParallelBA::ParallelBA(DeviceT device, const int num_threads) { +- // The wrapper intends to provide different implementations. 
+- +- if (device >= PBA_CUDA_DEVICE_DEFAULT) +-#ifndef PBA_NO_GPU +- { +- SparseBundleCU* cuba = new SparseBundleCU(device - PBA_CUDA_DEVICE0); +- if (cuba->GetMemCapacity() > 0) { +- _optimizer = cuba; +- } else { +- device = PBA_CPU_FLOAT; +- _optimizer = NewSparseBundleCPU(false, num_threads); +- delete cuba; +- } +- } else +-#else +- device = PBA_CPU_FLOAT; +-#endif +- if (device == PBA_CPU_FLOAT) +- _optimizer = NewSparseBundleCPU(false, num_threads); +- else if (device == PBA_CPU_DOUBLE) +- _optimizer = NewSparseBundleCPU(true, num_threads); +- else +- _optimizer = NULL; +-} +- +-ParallelBA::~ParallelBA() { +- if (_optimizer) delete _optimizer; +-} +- +-void ParallelBA::ParseParam(int narg, char** argv) { +- _optimizer->ParseParam(narg, argv); +-} +- +-ConfigBA* ParallelBA::GetInternalConfig() { +- if (_optimizer) +- return _optimizer->GetInternalConfig(); +- else +- return NULL; +-} +- +-void ParallelBA::SetFixedIntrinsics(bool fixed) { +- _optimizer->SetFixedIntrinsics(fixed); +-} +-void ParallelBA::EnableRadialDistortion(DistortionT enabled) { +- _optimizer->EnableRadialDistortion(enabled); +-} +-void ParallelBA::SetNextTimeBudget(int seconds) { +- _optimizer->SetNextTimeBudget(seconds); +-} +- +-void ParallelBA::SetNextBundleMode(BundleModeT mode) { +- _optimizer->SetNextBundleMode(mode); +-} +- +-void ParallelBA::SetCameraData(size_t ncam, CameraT* cams) { +- _optimizer->SetCameraData(ncam, cams); +-} +- +-void ParallelBA::SetPointData(size_t npoint, Point3D* pts) { +- _optimizer->SetPointData(npoint, pts); +-} +- +-void ParallelBA::SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx) { +- _optimizer->SetProjection(nproj, imgpts, point_idx, cam_idx); +-} +-int ParallelBA::RunBundleAdjustment() { +- return _optimizer->RunBundleAdjustment(); +-} +- +-float ParallelBA::GetMeanSquaredError() { +- return _optimizer->GetMeanSquaredError(); +-} +- +-int ParallelBA::GetCurrentIteration() { +- return _optimizer->GetCurrentIteration(); +-} +-void ParallelBA::AbortBundleAdjustment() { +- return _optimizer->AbortBundleAdjustment(); +-} +- +-void ParallelBA::ReserveStorage(size_t ncam, size_t npt, size_t nproj) { +- if (_optimizer) _optimizer->ReserveStorage(ncam, npt, nproj); +-} +- +-void ParallelBA::SetFocalMask(const int* fmask, float weight) { +- if (_optimizer && weight > 0) _optimizer->SetFocalMask(fmask, weight); +-} +- +-// void* ParallelBA::operator new(size_t size) { +-// void* p = malloc(size); +-// if (p == 0) { +-// const std::bad_alloc ba; +-// throw ba; +-// } +-// return p; +-// } +- +-ParallelBA* NewParallelBA(ParallelBA::DeviceT device) { +- return new ParallelBA(device); +-} +- +-int ParallelBA_GetVersion() { return 105; } +- +-} // namespace pba +diff --git a/lib/PBA/pba.h b/lib/PBA/pba.h +deleted file mode 100644 +index 3ebf5813f..000000000 +--- a/lib/PBA/pba.h ++++ /dev/null +@@ -1,156 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: pba.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : interface of class ParallelBA, which has two +-//implementations +-// SparseBundleCU for CUDA-based version, and +-// SparseBundleCPU for CPU multi-threading version +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either 
+-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef PARALLEL_BA_H +-#define PARALLEL_BA_H +- +-#if defined(_WIN32) +-#ifdef PBA_DLL +-#ifdef DLL_EXPORT +-#define PBA_EXPORT __declspec(dllexport) +-#else +-#define PBA_EXPORT __declspec(dllimport) +-#endif +-#else +-#define PBA_EXPORT +-#endif +- +-#define PBA_EXPORT_EXTERN PBA_EXPORT +- +-#if _MSC_VER > 1000 +-#pragma once +-#endif +-#else +-#define PBA_EXPORT +-#define PBA_EXPORT_EXTERN extern "C" +-#endif +- +-// filetype definitions for points and camera +-#include "DataInterface.h" +-#include "ConfigBA.h" +- +-namespace pba { +- +-class ParallelBA { +- public: +- enum StatusT { +- STATUS_SUCCESS = 0, +- STATUS_CAMERA_MISSING = 1, +- STATUS_POINT_MISSING, +- STATUS_PROJECTION_MISSING, +- STATUS_MEASURMENT_MISSING, +- STATUS_ALLOCATION_FAIL +- }; +- enum DeviceT { +- PBA_INVALID_DEVICE = -4, +- PBA_CPU_DOUBLE = -3, +- PBA_CPU_FLOAT = -2, +- PBA_CUDA_DEVICE_DEFAULT = -1, +- PBA_CUDA_DEVICE0 = 0 +- }; +- enum DistortionT { +- PBA_MEASUREMENT_DISTORTION = -1, // single parameter, apply to measurements +- PBA_NO_DISTORTION = 0, // no radial distortion +- PBA_PROJECTION_DISTORTION = 1 // single parameter, apply to projectino +- }; +- enum BundleModeT { +- BUNDLE_FULL = 0, +- BUNDLE_ONLY_MOTION = 1, +- BUNDLE_ONLY_STRUCTURE = 2, +- }; +- +- private: +- ParallelBA* _optimizer; +- +- public: +- //////////////////////////////////////////////////// +- // methods for changing bundle adjustment settings +- PBA_EXPORT virtual void ParseParam(int narg, char** argv); // indirect method +- PBA_EXPORT virtual ConfigBA* GetInternalConfig(); // direct method +- PBA_EXPORT virtual void SetFixedIntrinsics( +- bool fixed); // call this for calibrated system +- PBA_EXPORT virtual void EnableRadialDistortion( +- DistortionT type); // call this to enable radial distortion +- PBA_EXPORT virtual void SetNextTimeBudget( +- int seconds); //# of seconds for next run (0 = no limit) +- PBA_EXPORT virtual void ReserveStorage(size_t ncam, size_t npt, size_t nproj); +- +- public: +- // function name change; the old one is mapped as inline function +- inline void SetFocalLengthFixed(bool fixed) { SetFixedIntrinsics(fixed); } +- inline void ResetBundleStorage() { +- ReserveStorage(0, 0, 0); /*Reset devide for CUDA*/ +- } +- +- public: +- ///////////////////////////////////////////////////// +- // optimizer interface, input and run +- PBA_EXPORT virtual void SetCameraData(size_t ncam, +- CameraT* cams); // set camera data +- PBA_EXPORT virtual void SetPointData(size_t npoint, +- Point3D* pts); // set 3D point data +- PBA_EXPORT virtual void SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, +- const int* cam_idx); // set projections +- PBA_EXPORT virtual void SetNextBundleMode( +- BundleModeT +- mode = BUNDLE_FULL); // mode of the next bundle adjustment call +- PBA_EXPORT virtual int RunBundleAdjustment(); // start bundle adjustment, +- // return number of successful +- // LM iterations +- public: +- ////////////////////////////////////////////////// +- // Query optimzer runing status for Multi-threading +- // Three functions below can be called from a differnt thread while bundle 
is +- // running +- PBA_EXPORT virtual float +- GetMeanSquaredError(); // read back results during/after BA +- PBA_EXPORT virtual void +- AbortBundleAdjustment(); // tell bundle adjustment to abort ASAP +- PBA_EXPORT virtual int +- GetCurrentIteration(); // which iteration is it working on? +- public: +- PBA_EXPORT ParallelBA(DeviceT device = PBA_CUDA_DEVICE_DEFAULT, +- const int num_threads = -1); +- // PBA_EXPORT void* operator new(size_t size); +- PBA_EXPORT virtual ~ParallelBA(); +- +- public: +- ////////////////////////////////////////////// +- // Future functions will be added to the end for compatiability with old +- // version. +- PBA_EXPORT virtual void SetFocalMask(const int* fmask, float weight = 1.0f); +-}; +- +-// function for dynamic loading of library +-PBA_EXPORT_EXTERN ParallelBA* NewParallelBA( +- ParallelBA::DeviceT device = ParallelBA::PBA_CUDA_DEVICE_DEFAULT); +-typedef ParallelBA* (*NEWPARALLELBAPROC)(ParallelBA::DeviceT); +- +-/////////////////////////////////////////////// +-// older versions do not have this function. +-PBA_EXPORT_EXTERN int ParallelBA_GetVersion(); +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/util.h b/lib/PBA/util.h +deleted file mode 100644 +index a63c8bbce..000000000 +--- a/lib/PBA/util.h ++++ /dev/null +@@ -1,753 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: util.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : some utility functions for reading/writing SfM data +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-using namespace std; +-#include "DataInterface.h" +- +-namespace pba { +- +-// File loader supports .nvm format and bundler format +-bool LoadModelFile(const char* name, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc); +-void SaveNVM(const char* filename, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, vector& names, +- vector& ptc); +-void SaveBundlerModel(const char* filename, vector& camera_data, +- vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx); +- +-////////////////////////////////////////////////////////////////// +-void AddNoise(vector& camera_data, vector& point_data, +- float percent); +-void AddStableNoise(vector& camera_data, vector& point_data, +- const vector& ptidx, const vector& camidx, +- float percent); +-bool RemoveInvisiblePoints(vector& camera_data, +- vector& point_data, vector& ptidx, +- vector& camidx, vector& measurements, +- vector& names, vector& ptc); +- +-///////////////////////////////////////////////////////////////////////////// +-bool LoadNVM(ifstream& in, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, vector& names, +- vector& ptc) { +- int rotation_parameter_num = 4; +- bool format_r9t = false; +- string token; +- if (in.peek() == 'N') { +- in >> token; // file header +- if (strstr(token.c_str(), "R9T")) { +- rotation_parameter_num = 9; // rotation as 3x3 matrix +- format_r9t = true; +- } +- } +- +- int ncam = 0, npoint = 0, nproj = 0; +- // read # of cameras +- in >> ncam; +- if (ncam <= 1) return false; +- +- // read the camera parameters +- camera_data.resize(ncam); // allocate the camera data +- names.resize(ncam); +- for (int i = 0; i < ncam; ++i) { +- double f, q[9], c[3], d[2]; +- in >> token >> f; +- for (int j = 0; j < rotation_parameter_num; ++j) in >> q[j]; +- in >> c[0] >> c[1] >> c[2] >> d[0] >> d[1]; +- +- camera_data[i].SetFocalLength(f); +- if (format_r9t) { +- camera_data[i].SetMatrixRotation(q); +- camera_data[i].SetTranslation(c); +- } else { +- // older format for compability +- camera_data[i].SetQuaternionRotation(q); // quaternion from the file +- camera_data[i].SetCameraCenterAfterRotation( +- c); // camera center from the file +- } +- camera_data[i].SetNormalizedMeasurementDistortion(d[0]); +- names[i] = token; +- } +- +- ////////////////////////////////////// +- in >> npoint; +- if (npoint <= 0) return false; +- +- // read image projections and 3D points. 
+- point_data.resize(npoint); +- for (int i = 0; i < npoint; ++i) { +- float pt[3]; +- int cc[3], npj; +- in >> pt[0] >> pt[1] >> pt[2] >> cc[0] >> cc[1] >> cc[2] >> npj; +- for (int j = 0; j < npj; ++j) { +- int cidx, fidx; +- float imx, imy; +- in >> cidx >> fidx >> imx >> imy; +- +- camidx.push_back(cidx); // camera index +- ptidx.push_back(i); // point index +- +- // add a measurment to the vector +- measurements.push_back(Point2D(imx, imy)); +- nproj++; +- } +- point_data[i].SetPoint(pt); +- ptc.insert(ptc.end(), cc, cc + 3); +- } +- /////////////////////////////////////////////////////////////////////////////// +- std::cout << ncam << " cameras; " << npoint << " 3D points; " << nproj +- << " projections\n"; +- +- return true; +-} +- +-void SaveNVM(const char* filename, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, vector& names, +- vector& ptc) { +- std::cout << "Saving model to " << filename << "...\n"; +- ofstream out(filename); +- +- out << "NVM_V3_R9T\n" << camera_data.size() << '\n' << std::setprecision(12); +- if (names.size() < camera_data.size()) +- names.resize(camera_data.size(), string("unknown")); +- if (ptc.size() < 3 * point_data.size()) ptc.resize(point_data.size() * 3, 0); +- +- //////////////////////////////////// +- for (size_t i = 0; i < camera_data.size(); ++i) { +- CameraT& cam = camera_data[i]; +- out << names[i] << ' ' << cam.GetFocalLength() << ' '; +- for (int j = 0; j < 9; ++j) out << cam.m[0][j] << ' '; +- out << cam.t[0] << ' ' << cam.t[1] << ' ' << cam.t[2] << ' ' +- << cam.GetNormalizedMeasurementDistortion() << " 0\n"; +- } +- +- out << point_data.size() << '\n'; +- +- for (size_t i = 0, j = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- int* pc = &ptc[i * 3]; +- out << pt.xyz[0] << ' ' << pt.xyz[1] << ' ' << pt.xyz[2] << ' ' << pc[0] +- << ' ' << pc[1] << ' ' << pc[2] << ' '; +- +- size_t je = j; +- while (je < ptidx.size() && ptidx[je] == (int)i) je++; +- +- out << (je - j) << ' '; +- +- for (; j < je; ++j) +- out << camidx[j] << ' ' << " 0 " << measurements[j].x << ' ' +- << measurements[j].y << ' '; +- +- out << '\n'; +- } +-} +- +-bool LoadBundlerOut(const char* name, ifstream& in, +- vector& camera_data, vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx, vector& names, +- vector& ptc) { +- int rotation_parameter_num = 9; +- string token; +- while (in.peek() == '#') std::getline(in, token); +- +- char listpath[1024], filepath[1024]; +- strcpy(listpath, name); +- char* ext = strstr(listpath, ".out"); +- strcpy(ext, "-list.txt\0"); +- +- /////////////////////////////////// +- ifstream listin(listpath); +- if (!listin.is_open()) { +- listin.close(); +- listin.clear(); +- char* slash = strrchr(listpath, '/'); +- if (slash == NULL) slash = strrchr(listpath, '\\'); +- slash = slash ? 
slash + 1 : listpath; +- strcpy(slash, "image_list.txt"); +- listin.open(listpath); +- } +- if (listin) std::cout << "Using image list: " << listpath << '\n'; +- +- // read # of cameras +- int ncam = 0, npoint = 0, nproj = 0; +- in >> ncam >> npoint; +- if (ncam <= 1 || npoint <= 1) return false; +- std::cout << ncam << " cameras; " << npoint << " 3D points;\n"; +- +- // read the camera parameters +- camera_data.resize(ncam); // allocate the camera data +- names.resize(ncam); +- +- bool det_checked = false; +- for (int i = 0; i < ncam; ++i) { +- float f, q[9], c[3], d[2]; +- in >> f >> d[0] >> d[1]; +- for (int j = 0; j < rotation_parameter_num; ++j) in >> q[j]; +- in >> c[0] >> c[1] >> c[2]; +- +- camera_data[i].SetFocalLength(f); +- camera_data[i].SetInvertedR9T(q, c); +- camera_data[i].SetProjectionDistortion(d[0]); +- +- if (listin >> filepath && f != 0) { +- char* slash = strrchr(filepath, '/'); +- if (slash == NULL) slash = strchr(filepath, '\\'); +- names[i] = (slash ? (slash + 1) : filepath); +- std::getline(listin, token); +- +- if (!det_checked) { +- float det = camera_data[i].GetRotationMatrixDeterminant(); +- std::cout << "Check rotation matrix: " << det << '\n'; +- det_checked = true; +- } +- } else { +- names[i] = "unknown"; +- } +- } +- +- // read image projections and 3D points. +- point_data.resize(npoint); +- for (int i = 0; i < npoint; ++i) { +- float pt[3]; +- int cc[3], npj; +- in >> pt[0] >> pt[1] >> pt[2] >> cc[0] >> cc[1] >> cc[2] >> npj; +- for (int j = 0; j < npj; ++j) { +- int cidx, fidx; +- float imx, imy; +- in >> cidx >> fidx >> imx >> imy; +- +- camidx.push_back(cidx); // camera index +- ptidx.push_back(i); // point index +- +- // add a measurment to the vector +- measurements.push_back(Point2D(imx, -imy)); +- nproj++; +- } +- point_data[i].SetPoint(pt[0], pt[1], pt[2]); +- ptc.insert(ptc.end(), cc, cc + 3); +- } +- /////////////////////////////////////////////////////////////////////////////// +- std::cout << ncam << " cameras; " << npoint << " 3D points; " << nproj +- << " projections\n"; +- return true; +-} +- +-void SaveBundlerOut(const char* filename, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc) { +- char listpath[1024]; +- strcpy(listpath, filename); +- char* ext = strstr(listpath, ".out"); +- if (ext == NULL) return; +- strcpy(ext, "-list.txt\0"); +- +- ofstream out(filename); +- out << "# Bundle file v0.3\n"; +- out << std::setprecision(12); // need enough precision +- out << camera_data.size() << " " << point_data.size() << '\n'; +- +- // save camera data +- for (size_t i = 0; i < camera_data.size(); ++i) { +- float q[9], c[3]; +- CameraT& ci = camera_data[i]; +- out << ci.GetFocalLength() << ' ' << ci.GetProjectionDistortion() << " 0\n"; +- ci.GetInvertedR9T(q, c); +- for (int j = 0; j < 9; ++j) out << q[j] << (((j % 3) == 2) ? 
'\n' : ' '); +- out << c[0] << ' ' << c[1] << ' ' << c[2] << '\n'; +- } +- /// +- for (size_t i = 0, j = 0; i < point_data.size(); ++i) { +- int npj = 0, *ci = &ptc[i * 3]; +- Point3D& pt = point_data[i]; +- while (j + npj < point_data.size() && ptidx[j + npj] == ptidx[j]) npj++; +- /////////////////////////// +- out << pt.xyz[0] << ' ' << pt.xyz[1] << ' ' << pt.xyz[2] << '\n'; +- out << ci[0] << ' ' << ci[1] << ' ' << ci[2] << '\n'; +- out << npj << ' '; +- for (int k = 0; k < npj; ++k) +- out << camidx[j + k] << " 0 " << measurements[j + k].x << ' ' +- << -measurements[j + k].y << '\n'; +- out << '\n'; +- j += npj; +- } +- +- ofstream listout(listpath); +- for (size_t i = 0; i < names.size(); ++i) listout << names[i] << '\n'; +-} +- +-template +-bool LoadBundlerModel(ifstream& in, vector& camera_data, +- vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx) { +- // read bundle data from a file +- size_t ncam = 0, npt = 0, nproj = 0; +- if (!(in >> ncam >> npt >> nproj)) return false; +- /////////////////////////////////////////////////////////////////////////////// +- std::cout << ncam << " cameras; " << npt << " 3D points; " << nproj +- << " projections\n"; +- +- camera_data.resize(ncam); +- point_data.resize(npt); +- measurements.resize(nproj); +- camidx.resize(nproj); +- ptidx.resize(nproj); +- +- for (size_t i = 0; i < nproj; ++i) { +- double x, y; +- int cidx, pidx; +- in >> cidx >> pidx >> x >> y; +- if (((size_t)pidx) == npt && camidx.size() > i) { +- camidx.resize(i); +- ptidx.resize(i); +- measurements.resize(i); +- std::cout << "Truncate measurements to " << i << '\n'; +- } else if (((size_t)pidx) >= npt) { +- continue; +- } else { +- camidx[i] = cidx; +- ptidx[i] = pidx; +- measurements[i].SetPoint2D(x, -y); +- } +- } +- +- for (size_t i = 0; i < ncam; ++i) { +- double p[9]; +- for (int j = 0; j < 9; ++j) in >> p[j]; +- CameraT& cam = camera_data[i]; +- cam.SetFocalLength(p[6]); +- cam.SetInvertedRT(p, p + 3); +- cam.SetProjectionDistortion(p[7]); +- } +- +- for (size_t i = 0; i < npt; ++i) { +- double pt[3]; +- in >> pt[0] >> pt[1] >> pt[2]; +- point_data[i].SetPoint(pt); +- } +- return true; +-} +- +-void SaveBundlerModel(const char* filename, vector& camera_data, +- vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx) { +- std::cout << "Saving model to " << filename << "...\n"; +- ofstream out(filename); +- out << std::setprecision(12); // need enough precision +- out << camera_data.size() << ' ' << point_data.size() << ' ' +- << measurements.size() << '\n'; +- for (size_t i = 0; i < measurements.size(); ++i) { +- out << camidx[i] << ' ' << ptidx[i] << ' ' << measurements[i].x << ' ' +- << -measurements[i].y << '\n'; +- } +- +- for (size_t i = 0; i < camera_data.size(); ++i) { +- CameraT& cam = camera_data[i]; +- double r[3], t[3]; +- cam.GetInvertedRT(r, t); +- out << r[0] << ' ' << r[1] << ' ' << r[2] << ' ' << t[0] << ' ' << t[1] +- << ' ' << t[2] << ' ' << cam.f << ' ' << cam.GetProjectionDistortion() +- << " 0\n"; +- } +- +- for (size_t i = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- out << pt.xyz[0] << ' ' << pt.xyz[1] << ' ' << pt.xyz[2] << '\n'; +- } +-} +- +-bool LoadModelFile(const char* name, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc) { +- if (name == NULL) return false; +- ifstream in(name); +- +- std::cout << "Loading cameras/points: " << name << "\n"; +- if (!in.is_open()) return false; +- +- if 
(strstr(name, ".nvm")) +- return LoadNVM(in, camera_data, point_data, measurements, ptidx, camidx, +- names, ptc); +- else if (strstr(name, ".out")) +- return LoadBundlerOut(name, in, camera_data, point_data, measurements, +- ptidx, camidx, names, ptc); +- else +- return LoadBundlerModel(in, camera_data, point_data, measurements, ptidx, +- camidx); +-} +- +-float random_ratio(float percent) { +- return (rand() % 101 - 50) * 0.02f * percent + 1.0f; +-} +- +-void AddNoise(vector& camera_data, vector& point_data, +- float percent) { +- std::srand((unsigned int)time(NULL)); +- for (size_t i = 0; i < camera_data.size(); ++i) { +- camera_data[i].f *= random_ratio(percent); +- camera_data[i].t[0] *= random_ratio(percent); +- camera_data[i].t[1] *= random_ratio(percent); +- camera_data[i].t[2] *= random_ratio(percent); +- double e[3]; +- camera_data[i].GetRodriguesRotation(e); +- e[0] *= random_ratio(percent); +- e[1] *= random_ratio(percent); +- e[2] *= random_ratio(percent); +- camera_data[i].SetRodriguesRotation(e); +- } +- +- for (size_t i = 0; i < point_data.size(); ++i) { +- point_data[i].xyz[0] *= random_ratio(percent); +- point_data[i].xyz[1] *= random_ratio(percent); +- point_data[i].xyz[2] *= random_ratio(percent); +- } +-} +- +-void AddStableNoise(vector& camera_data, vector& point_data, +- const vector& ptidx, const vector& camidx, +- float percent) { +- /// +- std::srand((unsigned int)time(NULL)); +- // do not modify the visibility status.. +- vector zz0(ptidx.size()); +- vector backup = camera_data; +- vector vx(point_data.size()), vy(point_data.size()), +- vz(point_data.size()); +- for (size_t i = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- vx[i] = pt.xyz[0]; +- vy[i] = pt.xyz[1]; +- vz[i] = pt.xyz[2]; +- } +- +- // find out the median location of all the 3D points. +- size_t median_idx = point_data.size() / 2; +- +- std::nth_element(vx.begin(), vx.begin() + median_idx, vx.end()); +- std::nth_element(vy.begin(), vy.begin() + median_idx, vy.end()); +- std::nth_element(vz.begin(), vz.begin() + median_idx, vz.end()); +- float cx = vx[median_idx], cy = vy[median_idx], cz = vz[median_idx]; +- +- for (size_t i = 0; i < ptidx.size(); ++i) { +- CameraT& cam = camera_data[camidx[i]]; +- Point3D& pt = point_data[ptidx[i]]; +- zz0[i] = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- } +- +- vector z2 = zz0; +- median_idx = ptidx.size() / 2; +- std::nth_element(z2.begin(), z2.begin() + median_idx, z2.end()); +- float mz = z2[median_idx]; // median depth +- float dist_noise_base = mz * 0.2f; +- +- ///////////////////////////////////////////////// +- // modify points first.. 
+- for (size_t i = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- pt.xyz[0] = pt.xyz[0] - cx + dist_noise_base * random_ratio(percent); +- pt.xyz[1] = pt.xyz[1] - cy + dist_noise_base * random_ratio(percent); +- pt.xyz[2] = pt.xyz[2] - cz + dist_noise_base * random_ratio(percent); +- } +- +- vector need_modification(camera_data.size(), true); +- int invalid_count = 0, modify_iteration = 1; +- +- do { +- if (invalid_count) +- std::cout << "NOTE" << std::setw(2) << modify_iteration << ": modify " +- << invalid_count << " camera to fix visibility\n"; +- +- ////////////////////////////////////////////////////// +- for (size_t i = 0; i < camera_data.size(); ++i) { +- if (!need_modification[i]) continue; +- CameraT& cam = camera_data[i]; +- double e[3], c[3]; +- cam = backup[i]; +- cam.f *= random_ratio(percent); +- +- /////////////////////////////////////////////////////////// +- cam.GetCameraCenter(c); +- c[0] = c[0] - cx + dist_noise_base * random_ratio(percent); +- c[1] = c[1] - cy + dist_noise_base * random_ratio(percent); +- c[2] = c[2] - cz + dist_noise_base * random_ratio(percent); +- +- /////////////////////////////////////////////////////////// +- cam.GetRodriguesRotation(e); +- e[0] *= random_ratio(percent); +- e[1] *= random_ratio(percent); +- e[2] *= random_ratio(percent); +- +- /////////////////////////////////////////////////////////// +- cam.SetRodriguesRotation(e); +- cam.SetCameraCenterAfterRotation(c); +- } +- vector invalidc(camera_data.size(), false); +- +- invalid_count = 0; +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i]; +- if (need_modification[cid] == false) continue; +- if (invalidc[cid]) continue; +- CameraT& cam = camera_data[cid]; +- Point3D& pt = point_data[ptidx[i]]; +- float z = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- if (z * zz0[i] > 0) continue; +- if (zz0[i] == 0 && z > 0) continue; +- invalid_count++; +- invalidc[cid] = true; +- } +- +- need_modification = invalidc; +- modify_iteration++; +- +- } while (invalid_count && modify_iteration < 20); +-} +- +-void ExamineVisiblity(const char* input_filename) { +- ////////////// +- vector camera_data; +- vector point_data; +- vector ptidx, camidx; +- vector measurements; +- ifstream in(input_filename); +- LoadBundlerModel(in, camera_data, point_data, measurements, ptidx, camidx); +- +- //////////////// +- int count = 0; +- double d1 = 100, d2 = 100; +- std::cout << "checking visibility...\n"; +- vector zz(ptidx.size()); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- CameraD& cam = camera_data[camidx[i]]; +- Point3B& pt = point_data[ptidx[i]]; +- double dz = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- // double dx = cam.m[0][0] * pt.xyz[0] + cam.m[0][1] * pt.xyz[1] + +- // cam.m[0][2] * pt.xyz[2] + cam.t[0]; +- // double dy = cam.m[1][0] * pt.xyz[0] + cam.m[1][1] * pt.xyz[1] + +- // cam.m[1][2] * pt.xyz[2] + cam.t[1]; +- +- //////////////////////////////////////// +- float c[3]; +- cam.GetCameraCenter(c); +- +- CameraT camt; +- camt.SetCameraT(cam); +- Point3D ptt; +- ptt.SetPoint(pt.xyz); +- double fz = camt.m[2][0] * ptt.xyz[0] + camt.m[2][1] * ptt.xyz[1] + +- camt.m[2][2] * ptt.xyz[2] + camt.t[2]; +- double fz2 = camt.m[2][0] * (ptt.xyz[0] - c[0]) + +- camt.m[2][1] * (ptt.xyz[1] - c[1]) + +- camt.m[2][2] * (ptt.xyz[2] - c[2]); +- +- // if(dz == 0 && fz == 0) continue; +- +- if (dz * fz <= 0 || fz == 0) { +- std::cout << "cam " +- << camidx[i] //<& camera_data, +- 
vector& point_data, vector& ptidx, +- vector& camidx, vector& measurements, +- vector& names, vector& ptc) { +- vector zz(ptidx.size()); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- CameraT& cam = camera_data[camidx[i]]; +- Point3D& pt = point_data[ptidx[i]]; +- zz[i] = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- } +- size_t median_idx = ptidx.size() / 2; +- std::nth_element(zz.begin(), zz.begin() + median_idx, zz.end()); +- float dist_threshold = zz[median_idx] * 0.001f; +- +- // keep removing 3D points. until all of them are infront of the cameras.. +- vector pmask(point_data.size(), true); +- int points_removed = 0; +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i], pid = ptidx[i]; +- if (!pmask[pid]) continue; +- CameraT& cam = camera_data[cid]; +- Point3D& pt = point_data[pid]; +- bool visible = (cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2] > +- dist_threshold); +- pmask[pid] = visible; // this point should be removed +- if (!visible) points_removed++; +- } +- if (points_removed == 0) return false; +- vector cv(camera_data.size(), 0); +- // should any cameras be removed ? +- int min_observation = 20; // cameras should see at leat 20 points +- +- do { +- // count visible points for each camera +- std::fill(cv.begin(), cv.end(), 0); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i], pid = ptidx[i]; +- if (pmask[pid]) cv[cid]++; +- } +- +- // check if any more points should be removed +- vector pv(point_data.size(), 0); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i], pid = ptidx[i]; +- if (!pmask[pid]) continue; // point already removed +- if (cv[cid] < min_observation) // this camera shall be removed. +- { +- /// +- } else { +- pv[pid]++; +- } +- } +- +- points_removed = 0; +- for (size_t i = 0; i < point_data.size(); ++i) { +- if (pmask[i] == false) continue; +- if (pv[i] >= 2) continue; +- pmask[i] = false; +- points_removed++; +- } +- } while (points_removed > 0); +- +- //////////////////////////////////// +- vector cmask(camera_data.size(), true); +- for (size_t i = 0; i < camera_data.size(); ++i) +- cmask[i] = cv[i] >= min_observation; +- //////////////////////////////////////////////////////// +- +- vector cidx(camera_data.size()); +- vector pidx(point_data.size()); +- +- /// modified model. 
+- vector camera_data2; +- vector point_data2; +- vector ptidx2; +- vector camidx2; +- vector measurements2; +- vector names2; +- vector ptc2; +- +- // +- if (names.size() < camera_data.size()) +- names.resize(camera_data.size(), string("unknown")); +- if (ptc.size() < 3 * point_data.size()) ptc.resize(point_data.size() * 3, 0); +- +- ////////////////////////////// +- int new_camera_count = 0, new_point_count = 0; +- for (size_t i = 0; i < camera_data.size(); ++i) { +- if (!cmask[i]) continue; +- camera_data2.push_back(camera_data[i]); +- names2.push_back(names[i]); +- cidx[i] = new_camera_count++; +- } +- +- for (size_t i = 0; i < point_data.size(); ++i) { +- if (!pmask[i]) continue; +- point_data2.push_back(point_data[i]); +- ptc.push_back(ptc[i]); +- pidx[i] = new_point_count++; +- } +- +- int new_observation_count = 0; +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int pid = ptidx[i], cid = camidx[i]; +- if (!pmask[pid] || !cmask[cid]) continue; +- ptidx2.push_back(pidx[pid]); +- camidx2.push_back(cidx[cid]); +- measurements2.push_back(measurements[i]); +- new_observation_count++; +- } +- +- std::cout << "NOTE: removing " << (camera_data.size() - new_camera_count) +- << " cameras; " << (point_data.size() - new_point_count) +- << " 3D Points; " << (measurements.size() - new_observation_count) +- << " Observations;\n"; +- +- camera_data2.swap(camera_data); +- names2.swap(names); +- point_data2.swap(point_data); +- ptc2.swap(ptc); +- ptidx2.swap(ptidx); +- camidx2.swap(camidx); +- measurements2.swap(measurements); +- +- return true; +-} +- +-void SaveModelFile(const char* outpath, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc) { +- if (outpath == NULL) return; +- if (strstr(outpath, ".nvm")) +- SaveNVM(outpath, camera_data, point_data, measurements, ptidx, camidx, +- names, ptc); +- else if (strstr(outpath, ".out")) +- SaveBundlerOut(outpath, camera_data, point_data, measurements, ptidx, +- camidx, names, ptc); +- else +- SaveBundlerModel(outpath, camera_data, point_data, measurements, ptidx, +- camidx); +-} +- +-} // namespace pba +diff --git a/src/controllers/incremental_mapper.cc b/src/controllers/incremental_mapper.cc +index 80aa0651f..1d6091fb7 100644 +--- a/src/controllers/incremental_mapper.cc ++++ b/src/controllers/incremental_mapper.cc +@@ -63,15 +63,7 @@ void AdjustGlobalBundle(const IncrementalMapperOptions& options, + } + + PrintHeading1("Global bundle adjustment"); +- if (options.ba_global_use_pba && !options.fix_existing_images && +- num_reg_images >= kMinNumRegImagesForFastBA && +- ParallelBundleAdjuster::IsSupported(custom_ba_options, +- mapper->GetReconstruction())) { +- mapper->AdjustParallelGlobalBundle( +- custom_ba_options, options.ParallelGlobalBundleAdjustment()); +- } else { +- mapper->AdjustGlobalBundle(options.Mapper(), custom_ba_options); +- } ++ mapper->AdjustGlobalBundle(options.Mapper(), custom_ba_options); + } + + void IterativeLocalRefinement(const IncrementalMapperOptions& options, +@@ -263,18 +255,6 @@ BundleAdjustmentOptions IncrementalMapperOptions::GlobalBundleAdjustment() + return options; + } + +-ParallelBundleAdjuster::Options +-IncrementalMapperOptions::ParallelGlobalBundleAdjustment() const { +- ParallelBundleAdjuster::Options options; +- options.max_num_iterations = ba_global_max_num_iterations; +- options.print_summary = true; +- options.gpu_index = ba_global_pba_gpu_index; +- options.num_threads = num_threads; +- options.min_num_residuals_for_multi_threading 
= +- ba_min_num_residuals_for_multi_threading; +- return options; +-} +- + bool IncrementalMapperOptions::Check() const { + CHECK_OPTION_GT(min_num_matches, 0); + CHECK_OPTION_GT(max_num_models, 0); +diff --git a/src/controllers/incremental_mapper.h b/src/controllers/incremental_mapper.h +index 3686a58e0..f3731d1c8 100644 +--- a/src/controllers/incremental_mapper.h ++++ b/src/controllers/incremental_mapper.h +@@ -99,12 +99,6 @@ struct IncrementalMapperOptions { + // The maximum number of local bundle adjustment iterations. + int ba_local_max_num_iterations = 25; + +- // Whether to use PBA in global bundle adjustment. +- bool ba_global_use_pba = false; +- +- // The GPU index for PBA bundle adjustment. +- int ba_global_pba_gpu_index = -1; +- + // The growth rates after which to perform global bundle adjustment. + double ba_global_images_ratio = 1.1; + double ba_global_points_ratio = 1.1; +@@ -140,7 +134,6 @@ struct IncrementalMapperOptions { + IncrementalTriangulator::Options Triangulation() const; + BundleAdjustmentOptions LocalBundleAdjustment() const; + BundleAdjustmentOptions GlobalBundleAdjustment() const; +- ParallelBundleAdjuster::Options ParallelGlobalBundleAdjustment() const; + + bool Check() const; + +diff --git a/src/optim/bundle_adjustment.cc b/src/optim/bundle_adjustment.cc +index ace191426..2def3d63c 100644 +--- a/src/optim/bundle_adjustment.cc ++++ b/src/optim/bundle_adjustment.cc +@@ -529,259 +529,6 @@ void BundleAdjuster::ParameterizePoints(Reconstruction* reconstruction) { + } + } + +-//////////////////////////////////////////////////////////////////////////////// +-// ParallelBundleAdjuster +-//////////////////////////////////////////////////////////////////////////////// +- +-bool ParallelBundleAdjuster::Options::Check() const { +- CHECK_OPTION_GE(max_num_iterations, 0); +- return true; +-} +- +-ParallelBundleAdjuster::ParallelBundleAdjuster( +- const Options& options, const BundleAdjustmentOptions& ba_options, +- const BundleAdjustmentConfig& config) +- : options_(options), +- ba_options_(ba_options), +- config_(config), +- num_measurements_(0) { +- CHECK(options_.Check()); +- CHECK(ba_options_.Check()); +- CHECK_EQ(config_.NumConstantCameras(), 0) +- << "PBA does not allow to set individual cameras constant"; +- CHECK_EQ(config_.NumConstantPoses(), 0) +- << "PBA does not allow to set individual translational elements constant"; +- CHECK_EQ(config_.NumConstantTvecs(), 0) +- << "PBA does not allow to set individual translational elements constant"; +- CHECK(config_.NumVariablePoints() == 0 && config_.NumConstantPoints() == 0) +- << "PBA does not allow to parameterize individual 3D points"; +-} +- +-bool ParallelBundleAdjuster::Solve(Reconstruction* reconstruction) { +- CHECK_NOTNULL(reconstruction); +- CHECK_EQ(num_measurements_, 0) +- << "Cannot use the same ParallelBundleAdjuster multiple times"; +- CHECK(!ba_options_.refine_principal_point); +- CHECK_EQ(ba_options_.refine_focal_length, ba_options_.refine_extra_params); +- +- SetUp(reconstruction); +- +- const int num_residuals = static_cast(2 * measurements_.size()); +- +- size_t num_threads = options_.num_threads; +- if (num_residuals < options_.min_num_residuals_for_multi_threading) { +- num_threads = 1; +- } +- +- pba::ParallelBA::DeviceT device; +- const int kMaxNumResidualsFloat = 100 * 1000; +- if (num_residuals > kMaxNumResidualsFloat) { +- // The threshold for using double precision is empirically chosen and +- // ensures that the system can be reliable solved. 
+- device = pba::ParallelBA::PBA_CPU_DOUBLE; +- } else { +- if (options_.gpu_index < 0) { +- device = pba::ParallelBA::PBA_CUDA_DEVICE_DEFAULT; +- } else { +- device = static_cast( +- pba::ParallelBA::PBA_CUDA_DEVICE0 + options_.gpu_index); +- } +- } +- +- pba::ParallelBA pba(device, num_threads); +- +- pba.SetNextBundleMode(pba::ParallelBA::BUNDLE_FULL); +- pba.EnableRadialDistortion(pba::ParallelBA::PBA_PROJECTION_DISTORTION); +- pba.SetFixedIntrinsics(!ba_options_.refine_focal_length && +- !ba_options_.refine_extra_params); +- +- pba::ConfigBA* pba_config = pba.GetInternalConfig(); +- pba_config->__lm_delta_threshold /= 100.0f; +- pba_config->__lm_gradient_threshold /= 100.0f; +- pba_config->__lm_mse_threshold = 0.0f; +- pba_config->__cg_min_iteration = 10; +- pba_config->__verbose_level = 2; +- pba_config->__lm_max_iteration = options_.max_num_iterations; +- +- pba.SetCameraData(cameras_.size(), cameras_.data()); +- pba.SetPointData(points3D_.size(), points3D_.data()); +- pba.SetProjection(measurements_.size(), measurements_.data(), +- point3D_idxs_.data(), camera_idxs_.data()); +- +- Timer timer; +- timer.Start(); +- pba.RunBundleAdjustment(); +- timer.Pause(); +- +- // Compose Ceres solver summary from PBA options. +- summary_.num_residuals_reduced = num_residuals; +- summary_.num_effective_parameters_reduced = +- static_cast(8 * config_.NumImages() - +- 2 * config_.NumConstantCameras() + 3 * points3D_.size()); +- summary_.num_successful_steps = pba_config->GetIterationsLM() + 1; +- summary_.termination_type = ceres::TerminationType::USER_SUCCESS; +- summary_.initial_cost = +- pba_config->GetInitialMSE() * summary_.num_residuals_reduced / 4; +- summary_.final_cost = +- pba_config->GetFinalMSE() * summary_.num_residuals_reduced / 4; +- summary_.total_time_in_seconds = timer.ElapsedSeconds(); +- +- TearDown(reconstruction); +- +- if (options_.print_summary) { +- PrintHeading2("Bundle adjustment report"); +- PrintSolverSummary(summary_); +- } +- +- return true; +-} +- +-const ceres::Solver::Summary& ParallelBundleAdjuster::Summary() const { +- return summary_; +-} +- +-bool ParallelBundleAdjuster::IsSupported(const BundleAdjustmentOptions& options, +- const Reconstruction& reconstruction) { +- if (options.refine_principal_point || +- options.refine_focal_length != options.refine_extra_params) { +- return false; +- } +- +- // Check that all cameras are SIMPLE_RADIAL and that no intrinsics are shared. +- std::set camera_ids; +- for (const auto& image : reconstruction.Images()) { +- if (image.second.IsRegistered()) { +- if (camera_ids.count(image.second.CameraId()) != 0 || +- reconstruction.Camera(image.second.CameraId()).ModelId() != +- SimpleRadialCameraModel::model_id) { +- return false; +- } +- camera_ids.insert(image.second.CameraId()); +- } +- } +- return true; +-} +- +-void ParallelBundleAdjuster::SetUp(Reconstruction* reconstruction) { +- // Important: PBA requires the track of 3D points to be stored +- // contiguously, i.e. the point3D_idxs_ vector contains consecutive indices. 
+- cameras_.reserve(config_.NumImages()); +- camera_ids_.reserve(config_.NumImages()); +- ordered_image_ids_.reserve(config_.NumImages()); +- image_id_to_camera_idx_.reserve(config_.NumImages()); +- AddImagesToProblem(reconstruction); +- AddPointsToProblem(reconstruction); +-} +- +-void ParallelBundleAdjuster::TearDown(Reconstruction* reconstruction) { +- for (size_t i = 0; i < cameras_.size(); ++i) { +- const image_t image_id = ordered_image_ids_[i]; +- const pba::CameraT& pba_camera = cameras_[i]; +- +- // Note: Do not use PBA's quaternion methods as they seem to lead to +- // numerical instability or other issues. +- Image& image = reconstruction->Image(image_id); +- Eigen::Matrix3d rotation_matrix; +- pba_camera.GetMatrixRotation(rotation_matrix.data()); +- pba_camera.GetTranslation(image.Tvec().data()); +- image.Qvec() = RotationMatrixToQuaternion(rotation_matrix.transpose()); +- +- Camera& camera = reconstruction->Camera(image.CameraId()); +- camera.Params(0) = pba_camera.GetFocalLength(); +- camera.Params(3) = pba_camera.GetProjectionDistortion(); +- } +- +- for (size_t i = 0; i < points3D_.size(); ++i) { +- Point3D& point3D = reconstruction->Point3D(ordered_point3D_ids_[i]); +- points3D_[i].GetPoint(point3D.XYZ().data()); +- } +-} +- +-void ParallelBundleAdjuster::AddImagesToProblem( +- Reconstruction* reconstruction) { +- for (const image_t image_id : config_.Images()) { +- const Image& image = reconstruction->Image(image_id); +- CHECK_EQ(camera_ids_.count(image.CameraId()), 0) +- << "PBA does not support shared intrinsics"; +- +- const Camera& camera = reconstruction->Camera(image.CameraId()); +- CHECK_EQ(camera.ModelId(), SimpleRadialCameraModel::model_id) +- << "PBA only supports the SIMPLE_RADIAL camera model"; +- +- // Note: Do not use PBA's quaternion methods as they seem to lead to +- // numerical instability or other issues. 
+- const Eigen::Matrix3d rotation_matrix = +- QuaternionToRotationMatrix(image.Qvec()).transpose(); +- +- pba::CameraT pba_camera; +- pba_camera.SetFocalLength(camera.Params(0)); +- pba_camera.SetProjectionDistortion(camera.Params(3)); +- pba_camera.SetMatrixRotation(rotation_matrix.data()); +- pba_camera.SetTranslation(image.Tvec().data()); +- +- CHECK(!config_.HasConstantTvec(image_id)) +- << "PBA cannot fix partial extrinsics"; +- if (!ba_options_.refine_extrinsics || config_.HasConstantPose(image_id)) { +- CHECK(config_.IsConstantCamera(image.CameraId())) +- << "PBA cannot fix extrinsics only"; +- pba_camera.SetConstantCamera(); +- } else if (config_.IsConstantCamera(image.CameraId())) { +- pba_camera.SetFixedIntrinsic(); +- } else { +- pba_camera.SetVariableCamera(); +- } +- +- num_measurements_ += image.NumPoints3D(); +- cameras_.push_back(pba_camera); +- camera_ids_.insert(image.CameraId()); +- ordered_image_ids_.push_back(image_id); +- image_id_to_camera_idx_.emplace(image_id, +- static_cast(cameras_.size()) - 1); +- +- for (const Point2D& point2D : image.Points2D()) { +- if (point2D.HasPoint3D()) { +- point3D_ids_.insert(point2D.Point3DId()); +- } +- } +- } +-} +- +-void ParallelBundleAdjuster::AddPointsToProblem( +- Reconstruction* reconstruction) { +- points3D_.resize(point3D_ids_.size()); +- ordered_point3D_ids_.resize(point3D_ids_.size()); +- measurements_.resize(num_measurements_); +- camera_idxs_.resize(num_measurements_); +- point3D_idxs_.resize(num_measurements_); +- +- int point3D_idx = 0; +- size_t measurement_idx = 0; +- +- for (const auto point3D_id : point3D_ids_) { +- const Point3D& point3D = reconstruction->Point3D(point3D_id); +- points3D_[point3D_idx].SetPoint(point3D.XYZ().data()); +- ordered_point3D_ids_[point3D_idx] = point3D_id; +- +- for (const auto track_el : point3D.Track().Elements()) { +- if (image_id_to_camera_idx_.count(track_el.image_id) > 0) { +- const Image& image = reconstruction->Image(track_el.image_id); +- const Camera& camera = reconstruction->Camera(image.CameraId()); +- const Point2D& point2D = image.Point2D(track_el.point2D_idx); +- measurements_[measurement_idx].SetPoint2D( +- point2D.X() - camera.Params(1), point2D.Y() - camera.Params(2)); +- camera_idxs_[measurement_idx] = +- image_id_to_camera_idx_.at(track_el.image_id); +- point3D_idxs_[measurement_idx] = point3D_idx; +- measurement_idx += 1; +- } +- } +- point3D_idx += 1; +- } +- +- CHECK_EQ(point3D_idx, points3D_.size()); +- CHECK_EQ(measurement_idx, measurements_.size()); +-} +- + //////////////////////////////////////////////////////////////////////////////// + // RigBundleAdjuster + //////////////////////////////////////////////////////////////////////////////// +diff --git a/src/optim/bundle_adjustment.h b/src/optim/bundle_adjustment.h +index 8d6282ea7..13e462090 100644 +--- a/src/optim/bundle_adjustment.h ++++ b/src/optim/bundle_adjustment.h +@@ -39,7 +39,6 @@ + + #include + +-#include "PBA/pba.h" + #include "base/camera_rig.h" + #include "base/reconstruction.h" + #include "util/alignment.h" +@@ -202,71 +201,6 @@ class BundleAdjuster { + std::unordered_map point3D_num_observations_; + }; + +-// Bundle adjustment using PBA (GPU or CPU). Less flexible and accurate than +-// Ceres-Solver bundle adjustment but much faster. Only supports SimpleRadial +-// camera model. +-class ParallelBundleAdjuster { +- public: +- struct Options { +- // Whether to print a final summary. +- bool print_summary = true; +- +- // Maximum number of iterations. 
+- int max_num_iterations = 50; +- +- // Index of the GPU used for bundle adjustment. +- int gpu_index = -1; +- +- // Number of threads for CPU based bundle adjustment. +- int num_threads = -1; +- +- // Minimum number of residuals to enable multi-threading. Note that +- // single-threaded is typically better for small bundle adjustment problems +- // due to the overhead of threading. +- int min_num_residuals_for_multi_threading = 50000; +- +- bool Check() const; +- }; +- +- ParallelBundleAdjuster(const Options& options, +- const BundleAdjustmentOptions& ba_options, +- const BundleAdjustmentConfig& config); +- +- bool Solve(Reconstruction* reconstruction); +- +- // Get the Ceres solver summary for the last call to `Solve`. +- const ceres::Solver::Summary& Summary() const; +- +- // Check whether PBA is supported for the given reconstruction. If the +- // reconstruction is not supported, the PBA solver will exit ungracefully. +- static bool IsSupported(const BundleAdjustmentOptions& options, +- const Reconstruction& reconstruction); +- +- private: +- void SetUp(Reconstruction* reconstruction); +- void TearDown(Reconstruction* reconstruction); +- +- void AddImagesToProblem(Reconstruction* reconstruction); +- void AddPointsToProblem(Reconstruction* reconstruction); +- +- const Options options_; +- const BundleAdjustmentOptions ba_options_; +- BundleAdjustmentConfig config_; +- ceres::Solver::Summary summary_; +- +- size_t num_measurements_; +- std::vector cameras_; +- std::vector points3D_; +- std::vector measurements_; +- std::unordered_set camera_ids_; +- std::unordered_set point3D_ids_; +- std::vector camera_idxs_; +- std::vector point3D_idxs_; +- std::vector ordered_image_ids_; +- std::vector ordered_point3D_ids_; +- std::unordered_map image_id_to_camera_idx_; +-}; +- + class RigBundleAdjuster : public BundleAdjuster { + public: + struct Options { +diff --git a/src/optim/bundle_adjustment_test.cc b/src/optim/bundle_adjustment_test.cc +index 1d8ba0e6f..f1c4d6bee 100644 +--- a/src/optim/bundle_adjustment_test.cc ++++ b/src/optim/bundle_adjustment_test.cc +@@ -644,114 +644,6 @@ BOOST_AUTO_TEST_CASE(TestConstantExtraParam) { + } + } + +-BOOST_AUTO_TEST_CASE(TestParallelReconstructionSupported) { +- BundleAdjustmentOptions options; +- options.refine_focal_length = true; +- options.refine_principal_point = false; +- options.refine_extra_params = true; +- Reconstruction reconstruction; +- CorrespondenceGraph correspondence_graph; +- GenerateReconstruction(2, 100, &reconstruction, &correspondence_graph); +- BOOST_CHECK(ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- reconstruction.Camera(0).SetModelIdFromName("SIMPLE_PINHOLE"); +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- reconstruction.Camera(0).SetModelIdFromName("SIMPLE_RADIAL"); +- BOOST_CHECK(ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- options.refine_principal_point = true; +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- options.refine_principal_point = false; +- +- options.refine_focal_length = false; +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- options.refine_extra_params = false; +- BOOST_CHECK(ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- options.refine_focal_length = true; +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +-} +- +-BOOST_AUTO_TEST_CASE(TestParallelTwoViewVariableIntrinsics) { +- Reconstruction reconstruction; +- 
CorrespondenceGraph correspondence_graph; +- GenerateReconstruction(2, 100, &reconstruction, &correspondence_graph); +- const auto orig_reconstruction = reconstruction; +- +- BundleAdjustmentConfig config; +- config.AddImage(0); +- config.AddImage(1); +- +- ParallelBundleAdjuster::Options options; +- BundleAdjustmentOptions ba_options; +- ba_options.refine_focal_length = true; +- ba_options.refine_principal_point = false; +- ba_options.refine_extra_params = true; +- ParallelBundleAdjuster bundle_adjuster(options, ba_options, config); +- BOOST_REQUIRE(bundle_adjuster.Solve(&reconstruction)); +- +- const auto summary = bundle_adjuster.Summary(); +- +- // 100 points, 2 images, 2 residuals per point per image +- BOOST_CHECK_EQUAL(summary.num_residuals_reduced, 400); +- // 100 x 3 point parameters +- // + 12 image parameters +- // + 2 x 2 camera parameters +- BOOST_CHECK_EQUAL(summary.num_effective_parameters_reduced, 316); +- +- CheckVariableCamera(reconstruction.Camera(0), orig_reconstruction.Camera(0)); +- CheckVariableImage(reconstruction.Image(0), orig_reconstruction.Image(0)); +- +- CheckVariableCamera(reconstruction.Camera(1), orig_reconstruction.Camera(1)); +- CheckVariableImage(reconstruction.Image(1), orig_reconstruction.Image(1)); +- +- for (const auto& point3D : reconstruction.Points3D()) { +- CheckVariablePoint(point3D.second, +- orig_reconstruction.Point3D(point3D.first)); +- } +-} +- +-BOOST_AUTO_TEST_CASE(TestParallelTwoViewConstantIntrinsics) { +- Reconstruction reconstruction; +- CorrespondenceGraph correspondence_graph; +- GenerateReconstruction(2, 100, &reconstruction, &correspondence_graph); +- const auto orig_reconstruction = reconstruction; +- +- BundleAdjustmentConfig config; +- config.AddImage(0); +- config.AddImage(1); +- +- ParallelBundleAdjuster::Options options; +- BundleAdjustmentOptions ba_options; +- ba_options.refine_focal_length = false; +- ba_options.refine_principal_point = false; +- ba_options.refine_extra_params = false; +- ParallelBundleAdjuster bundle_adjuster(options, ba_options, config); +- BOOST_REQUIRE(bundle_adjuster.Solve(&reconstruction)); +- +- const auto summary = bundle_adjuster.Summary(); +- +- // 100 points, 2 images, 2 residuals per point per image +- BOOST_CHECK_EQUAL(summary.num_residuals_reduced, 400); +- // 100 x 3 point parameters +- // + 12 image parameters +- // + 2 x 2 camera parameters +- BOOST_CHECK_EQUAL(summary.num_effective_parameters_reduced, 316); +- +- CheckConstantCamera(reconstruction.Camera(0), orig_reconstruction.Camera(0)); +- CheckVariableImage(reconstruction.Image(0), orig_reconstruction.Image(0)); +- +- CheckConstantCamera(reconstruction.Camera(1), orig_reconstruction.Camera(1)); +- CheckVariableImage(reconstruction.Image(1), orig_reconstruction.Image(1)); +- +- for (const auto& point3D : reconstruction.Points3D()) { +- CheckVariablePoint(point3D.second, +- orig_reconstruction.Point3D(point3D.first)); +- } +-} +- + BOOST_AUTO_TEST_CASE(TestRigTwoView) { + Reconstruction reconstruction; + CorrespondenceGraph correspondence_graph; +diff --git a/src/sfm/incremental_mapper.cc b/src/sfm/incremental_mapper.cc +index 33bd82305..c1362d0f0 100644 +--- a/src/sfm/incremental_mapper.cc ++++ b/src/sfm/incremental_mapper.cc +@@ -713,39 +713,6 @@ bool IncrementalMapper::AdjustGlobalBundle( + return true; + } + +-bool IncrementalMapper::AdjustParallelGlobalBundle( +- const BundleAdjustmentOptions& ba_options, +- const ParallelBundleAdjuster::Options& parallel_ba_options) { +- CHECK_NOTNULL(reconstruction_); +- +- const std::vector& 
reg_image_ids = reconstruction_->RegImageIds();
+-
+-  CHECK_GE(reg_image_ids.size(), 2)
+-      << "At least two images must be registered for global bundle-adjustment";
+-
+-  // Avoid degeneracies in bundle adjustment.
+-  reconstruction_->FilterObservationsWithNegativeDepth();
+-
+-  // Configure bundle adjustment.
+-  BundleAdjustmentConfig ba_config;
+-  for (const image_t image_id : reg_image_ids) {
+-    ba_config.AddImage(image_id);
+-  }
+-
+-  // Run bundle adjustment.
+-  ParallelBundleAdjuster bundle_adjuster(parallel_ba_options, ba_options,
+-                                         ba_config);
+-  if (!bundle_adjuster.Solve(reconstruction_)) {
+-    return false;
+-  }
+-
+-  // Normalize scene for numerical stability and
+-  // to avoid large scale changes in viewer.
+-  reconstruction_->Normalize();
+-
+-  return true;
+-}
+-
+ size_t IncrementalMapper::FilterImages(const Options& options) {
+   CHECK_NOTNULL(reconstruction_);
+   CHECK(options.Check());
+diff --git a/src/sfm/incremental_mapper.h b/src/sfm/incremental_mapper.h
+index 859194f14..5dd6fc549 100644
+--- a/src/sfm/incremental_mapper.h
++++ b/src/sfm/incremental_mapper.h
+@@ -206,12 +206,9 @@ class IncrementalMapper {
+       const IncrementalTriangulator::Options& tri_options,
+       const image_t image_id, const std::unordered_set<point3D_t>& point3D_ids);
+
+-  // Global bundle adjustment using Ceres Solver or PBA.
++  // Global bundle adjustment using Ceres Solver.
+   bool AdjustGlobalBundle(const Options& options,
+                           const BundleAdjustmentOptions& ba_options);
+-  bool AdjustParallelGlobalBundle(
+-      const BundleAdjustmentOptions& ba_options,
+-      const ParallelBundleAdjuster::Options& parallel_ba_options);
+
+   // Filter images and point observations.
+   size_t FilterImages(const Options& options);
+diff --git a/src/ui/license_widget.cc b/src/ui/license_widget.cc
+index d1cedf667..def4cc8bb 100644
+--- a/src/ui/license_widget.cc
++++ b/src/ui/license_widget.cc
+@@ -45,8 +45,6 @@ LicenseWidget::LicenseWidget(QWidget* parent) : QTextEdit(parent) {
+   licenses += "<h2>External</h2>";
+   licenses += "<h3>LSD</h3>";
+   licenses += GetLSDLicense();
+-  licenses += "<h3>PBA</h3>";
+-  licenses += GetPBALicense();
+   licenses += "<h3>PoissonRecon</h3>";
+   licenses += GetPoissonReconLicense();
+   licenses += "<h3>SiftGPU</h3>";
+@@ -137,23 +135,6 @@ QString LicenseWidget::GetLSDLicense() const {
+   return license;
+ }
+
+-QString LicenseWidget::GetPBALicense() const {
+-  const QString license =
+-      "Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu)<br>"
+-      "and the University of Washington at Seattle<br>"
+-      "<br>"
+-      "This library is free software; you can redistribute it and/or<br>"
+-      "modify it under the terms of the GNU General Public<br>"
+-      "License as published by the Free Software Foundation; either<br>"
+-      "Version 3 of the License, or (at your option) any later version.<br>"
+-      "<br>"
+-      "This library is distributed in the hope that it will be useful,<br>"
+-      "but WITHOUT ANY WARRANTY; without even the implied warranty of<br>"
+-      "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br>"
+-      "General Public License for more details.";
+-  return license;
+-}
+-
+ QString LicenseWidget::GetPoissonReconLicense() const {
+   const QString license =
+       "The MIT License (MIT)<br>"
" +diff --git a/src/ui/license_widget.h b/src/ui/license_widget.h +index 0a43902b3..e6eca4da0 100644 +--- a/src/ui/license_widget.h ++++ b/src/ui/license_widget.h +@@ -43,7 +43,6 @@ class LicenseWidget : public QTextEdit { + private: + QString GetCOLMAPLicense() const; + QString GetLSDLicense() const; +- QString GetPBALicense() const; + QString GetPoissonReconLicense() const; + QString GetSiftGPULicense() const; + QString GetSQLiteLicense() const; +diff --git a/src/ui/reconstruction_options_widget.cc b/src/ui/reconstruction_options_widget.cc +index b9b79f8a3..3f80846a1 100644 +--- a/src/ui/reconstruction_options_widget.cc ++++ b/src/ui/reconstruction_options_widget.cc +@@ -130,15 +130,12 @@ MapperBundleAdjustmentOptionsWidget::MapperBundleAdjustmentOptionsWidget( + AddSpacer(); + + AddSection("Global Bundle Adjustment"); +- AddOptionBool(&options->mapper->ba_global_use_pba, +- "use_pba\n(requires SIMPLE_RADIAL)"); + AddOptionDouble(&options->mapper->ba_global_images_ratio, "images_ratio"); + AddOptionInt(&options->mapper->ba_global_images_freq, "images_freq"); + AddOptionDouble(&options->mapper->ba_global_points_ratio, "points_ratio"); + AddOptionInt(&options->mapper->ba_global_points_freq, "points_freq"); + AddOptionInt(&options->mapper->ba_global_max_num_iterations, + "max_num_iterations"); +- AddOptionInt(&options->mapper->ba_global_pba_gpu_index, "pba_gpu_index", -1); + AddOptionInt(&options->mapper->ba_global_max_refinements, "max_refinements", + 1); + AddOptionDouble(&options->mapper->ba_global_max_refinement_change, +diff --git a/src/util/option_manager.cc b/src/util/option_manager.cc +index e31105490..f620078af 100644 +--- a/src/util/option_manager.cc ++++ b/src/util/option_manager.cc +@@ -514,10 +514,6 @@ void OptionManager::AddMapperOptions() { + &mapper->ba_local_function_tolerance); + AddAndRegisterDefaultOption("Mapper.ba_local_max_num_iterations", + &mapper->ba_local_max_num_iterations); +- AddAndRegisterDefaultOption("Mapper.ba_global_use_pba", +- &mapper->ba_global_use_pba); +- AddAndRegisterDefaultOption("Mapper.ba_global_pba_gpu_index", +- &mapper->ba_global_pba_gpu_index); + AddAndRegisterDefaultOption("Mapper.ba_global_images_ratio", + &mapper->ba_global_images_ratio); + AddAndRegisterDefaultOption("Mapper.ba_global_points_ratio", diff --git a/recipe/1841.patch b/recipe/1841.patch new file mode 100644 index 0000000..8cd9c17 --- /dev/null +++ b/recipe/1841.patch @@ -0,0 +1,40 @@ +From 9bbf3e688996ad05c0faf8c7345a77d7be4c3263 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Sun, 12 Mar 2023 15:53:59 +0100 +Subject: [PATCH] Replace deprecated CUDA sature function call + +--- + lib/SiftGPU/ProgramCU.cu | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu +index 0b99ad066..700c26b0f 100644 +--- a/lib/SiftGPU/ProgramCU.cu ++++ b/lib/SiftGPU/ProgramCU.cu +@@ -1245,7 +1245,7 @@ void __global__ ConvertDOG_Kernel(cudaTextureObject_t texData, float* d_result, + int index = row * width + col; + float v = tex1Dfetch(texData, index); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? +- 0.5 : saturate(0.5+20.0*v); ++ 0.5 : __saturatef(0.5+20.0*v); + } + } + /// +@@ -1269,7 +1269,7 @@ void __global__ ConvertGRD_Kernel(cudaTextureObject_t texData, float* d_result, + int index = row * width + col; + float v = tex1Dfetch(texData, index << 1); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? 
+-    0 : saturate(5 * v);
++    0 : __saturatef(5 * v);
+
+ }
+ }
+@@ -1297,7 +1297,7 @@ void __global__ ConvertKEY_Kernel(cudaTextureObject_t texData, cudaTextureObject
+     float4 keyv = tex1Dfetch<float4>(texDataF4, index);
+     int is_key = (keyv.x == 1.0f || keyv.x == -1.0f);
+     int inside = col > 0 && row > 0 && row < height -1 && col < width - 1;
+-    float v = inside? saturate(0.5 + 20 * tex1Dfetch<float>(texData, index)) : 0.5;
++    float v = inside? __saturatef(0.5 + 20 * tex1Dfetch<float>(texData, index)) : 0.5;
+     d_result[index] = is_key && inside ?
+         (keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)):
+         make_float4(v, v, v, 1.0f) ;
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index e406d2e..34d823f 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,6 +1,6 @@
 {% set name = "colmap" %}
 {% set version = "3.8" %}
-{% set build = 2 %}
+{% set build = 3 %}
 
 {% set processor = "cpu" if cuda_compiler_version == "None" else "gpu" %}  # [not osx]
 {% set processor = "cpu" %}  # [osx]
@@ -19,6 +19,11 @@ source:
     - fix_build.patch
     - unvendor_vlfeat.patch  # [unix]
     - fix_find_lz4.patch
+    - 1809.patch  # [cuda_compiler_version == "12.0"]
+    - 1840.patch  # [cuda_compiler_version == "12.0"]
+    - 1823.patch  # [cuda_compiler_version == "12.0"]
+    - 1838.patch  # [cuda_compiler_version == "12.0"]
+    - 1841.patch  # [cuda_compiler_version == "12.0"]
 
 build:
   number: {{ build }}
@@ -82,6 +87,7 @@ requirements:
     - gmp  # [unix]
     - lz4-c
     - metis
+    - libcurand-dev  # [cuda_compiler_version == "12.0"]
   run:
     - boost-cpp
    - vlfeat  # [unix]
@@ -112,7 +118,7 @@ about:
   license_file:
     - COPYING.txt
    - lib/LSD/LICENSE
-    - lib/PBA/LICENSE
+    - lib/PBA/LICENSE  # [cuda_compiler_version != "12.0"]
    - lib/PoissonRecon/LICENSE
    - lib/SiftGPU/LICENSE
   summary: COLMAP is a general-purpose Structure-from-Motion (SfM) and Multi-View Stereo (MVS) pipeline with a graphical and command-line interface.