diff --git a/.azure-pipelines/azure-pipelines-linux.yml b/.azure-pipelines/azure-pipelines-linux.yml
index be1ea3f..6aedfe0 100755
--- a/.azure-pipelines/azure-pipelines-linux.yml
+++ b/.azure-pipelines/azure-pipelines-linux.yml
@@ -20,6 +20,10 @@ jobs:
         CONFIG: linux_64_c_compiler_version12cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version12
         UPLOAD_PACKAGES: 'True'
         DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cos7-x86_64
+      linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12:
+        CONFIG: linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12
+        UPLOAD_PACKAGES: 'True'
+        DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cos7-x86_64
       linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7:
         CONFIG: linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7
         UPLOAD_PACKAGES: 'True'
diff --git a/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml b/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml
new file mode 100644
index 0000000..eef5d92
--- /dev/null
+++ b/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml
@@ -0,0 +1,66 @@
+boost_cpp:
+- 1.78.0
+c_compiler:
+- gcc
+c_compiler_version:
+- '12'
+cdt_name:
+- cos7
+channel_sources:
+- conda-forge
+channel_targets:
+- conda-forge main
+cuda_compiler:
+- cuda-nvcc
+cuda_compiler_version:
+- '12.0'
+cxx_compiler:
+- gxx
+cxx_compiler_version:
+- '12'
+docker_image:
+- quay.io/condaforge/linux-anvil-cos7-x86_64
+flann:
+- 1.9.1
+gflags:
+- '2.2'
+glew:
+- '2.1'
+glog:
+- '0.6'
+gmp:
+- '6'
+libblas:
+- 3.9 *netlib
+libcblas:
+- 3.9 *netlib
+libxcb:
+- '1.15'
+lz4_c:
+- 1.9.3
+metis:
+- '5.1'
+pin_run_as_build:
+  boost-cpp:
+    max_pin: x.x.x
+  flann:
+    max_pin: x.x.x
+  vlfeat:
+    max_pin: x.x.x
+qt_main:
+- '5.15'
+sqlite:
+- '3'
+suitesparse:
+- '5'
+target_platform:
+- linux-64
+vlfeat:
+- 0.9.21
+zip_keys:
+- - c_compiler_version
+  - cxx_compiler_version
+  - cuda_compiler
+  - cuda_compiler_version
+  - cdt_name
+  - docker_image
diff --git a/.ci_support/migrations/cuda120.yaml b/.ci_support/migrations/cuda120.yaml
new file mode 100644
index 0000000..25f0f88
--- /dev/null
+++ b/.ci_support/migrations/cuda120.yaml
@@ -0,0 +1,72 @@
+migrator_ts: 1682985063
+__migrator:
+  kind:
+    version
+  migration_number:
+    1
+  build_number:
+    1
+  paused: false
+  override_cbc_keys:
+    - cuda_compiler_stub
+  operation: key_add
+  check_solvable: false
+  primary_key: cuda_compiler_version
+  ordering:
+    cxx_compiler_version:
+      - 9
+      - 8
+      - 7
+    c_compiler_version:
+      - 9
+      - 8
+      - 7
+    fortran_compiler_version:
+      - 9
+      - 8
+      - 7
+    docker_image:
+      - quay.io/condaforge/linux-anvil-comp7  # [os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-aarch64  # [os.environ.get("BUILD_PLATFORM") == "linux-aarch64"]
+      - quay.io/condaforge/linux-anvil-ppc64le  # [os.environ.get("BUILD_PLATFORM") == "linux-ppc64le"]
+      - quay.io/condaforge/linux-anvil-armv7l  # [os.environ.get("BUILD_PLATFORM") == "linux-armv7l"]
+      - quay.io/condaforge/linux-anvil-cuda:9.2  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:10.0  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:10.1  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:10.2  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:11.0  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:11.1  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cuda:11.2  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+      - quay.io/condaforge/linux-anvil-cos7-x86_64  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+    cuda_compiler_version:
+      - None
+      - 10.2  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 11.0  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 11.1  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 11.2  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+      - 12.0  # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  commit_message: "Rebuild for CUDA 12"
+
+cuda_compiler:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - cuda-nvcc  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cuda_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12.0  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+c_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cxx_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+fortran_compiler_version:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 12  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cudnn:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - 8  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cdt_name:  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - cos7  # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+docker_image:  # [os.environ.get("BUILD_PLATFORM", "").startswith("linux-") and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+  - quay.io/condaforge/linux-anvil-cos7-x86_64  # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64" and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
diff --git a/README.md b/README.md
index dc83bb8..4ab6a92 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,13 @@ Current build status
         variant
+
+        linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12
+
+
+        variant
+
+
         linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7
diff --git a/recipe/1809.patch b/recipe/1809.patch
new file mode 100644
index 0000000..0958c97
--- /dev/null
+++ b/recipe/1809.patch
@@ -0,0 +1,1125 @@
+diff --git a/src/mvs/cuda_array_wrapper.h b/src/mvs/cuda_array_wrapper.h
+deleted file mode 100644
+index e4e48b0e8..000000000
+--- a/src/mvs/cuda_array_wrapper.h
++++ /dev/null
+@@ -1,171 +0,0 @@
+-// Copyright (c) 2023, ETH Zurich and UNC Chapel Hill.
+-// All rights reserved.
+-// +-// Redistribution and use in source and binary forms, with or without +-// modification, are permitted provided that the following conditions are met: +-// +-// * Redistributions of source code must retain the above copyright +-// notice, this list of conditions and the following disclaimer. +-// +-// * Redistributions in binary form must reproduce the above copyright +-// notice, this list of conditions and the following disclaimer in the +-// documentation and/or other materials provided with the distribution. +-// +-// * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of +-// its contributors may be used to endorse or promote products derived +-// from this software without specific prior written permission. +-// +-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +-// POSSIBILITY OF SUCH DAMAGE. +-// +-// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de) +- +-#ifndef COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_ +-#define COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_ +- +-#include +- +-#include +- +-#include "mvs/gpu_mat.h" +-#include "util/cudacc.h" +- +-namespace colmap { +-namespace mvs { +- +-template +-class CudaArrayWrapper { +- public: +- CudaArrayWrapper(const size_t width, const size_t height, const size_t depth); +- ~CudaArrayWrapper(); +- +- const cudaArray* GetPtr() const; +- cudaArray* GetPtr(); +- +- size_t GetWidth() const; +- size_t GetHeight() const; +- size_t GetDepth() const; +- +- void CopyToDevice(const T* data); +- void CopyToHost(const T* data); +- void CopyFromGpuMat(const GpuMat& array); +- +- private: +- // Define class as non-copyable and non-movable. 
+- CudaArrayWrapper(CudaArrayWrapper const&) = delete; +- void operator=(CudaArrayWrapper const& obj) = delete; +- CudaArrayWrapper(CudaArrayWrapper&&) = delete; +- +- void Allocate(); +- void Deallocate(); +- +- cudaArray* array_; +- +- size_t width_; +- size_t height_; +- size_t depth_; +-}; +- +-//////////////////////////////////////////////////////////////////////////////// +-// Implementation +-//////////////////////////////////////////////////////////////////////////////// +- +-template +-CudaArrayWrapper::CudaArrayWrapper(const size_t width, const size_t height, +- const size_t depth) +- : width_(width), height_(height), depth_(depth), array_(nullptr) {} +- +-template +-CudaArrayWrapper::~CudaArrayWrapper() { +- Deallocate(); +-} +- +-template +-const cudaArray* CudaArrayWrapper::GetPtr() const { +- return array_; +-} +- +-template +-cudaArray* CudaArrayWrapper::GetPtr() { +- return array_; +-} +- +-template +-size_t CudaArrayWrapper::GetWidth() const { +- return width_; +-} +- +-template +-size_t CudaArrayWrapper::GetHeight() const { +- return height_; +-} +- +-template +-size_t CudaArrayWrapper::GetDepth() const { +- return depth_; +-} +- +-template +-void CudaArrayWrapper::CopyToDevice(const T* data) { +- cudaMemcpy3DParms params = {0}; +- Allocate(); +- params.extent = make_cudaExtent(width_, height_, depth_); +- params.kind = cudaMemcpyHostToDevice; +- params.dstArray = array_; +- params.srcPtr = +- make_cudaPitchedPtr((void*)data, width_ * sizeof(T), width_, height_); +- CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); +-} +- +-template +-void CudaArrayWrapper::CopyToHost(const T* data) { +- cudaMemcpy3DParms params = {0}; +- params.extent = make_cudaExtent(width_, height_, depth_); +- params.kind = cudaMemcpyDeviceToHost; +- params.dstPtr = +- make_cudaPitchedPtr((void*)data, width_ * sizeof(T), width_, height_); +- params.srcArray = array_; +- CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); +-} +- +-template +-void CudaArrayWrapper::CopyFromGpuMat(const GpuMat& array) { +- Allocate(); +- cudaMemcpy3DParms parameters = {0}; +- parameters.extent = make_cudaExtent(width_, height_, depth_); +- parameters.kind = cudaMemcpyDeviceToDevice; +- parameters.dstArray = array_; +- parameters.srcPtr = make_cudaPitchedPtr((void*)array.GetPtr(), +- array.GetPitch(), width_, height_); +- CUDA_SAFE_CALL(cudaMemcpy3D(¶meters)); +-} +- +-template +-void CudaArrayWrapper::Allocate() { +- Deallocate(); +- struct cudaExtent extent = make_cudaExtent(width_, height_, depth_); +- cudaChannelFormatDesc fmt = cudaCreateChannelDesc(); +- CUDA_SAFE_CALL(cudaMalloc3DArray(&array_, &fmt, extent, cudaArrayLayered)); +-} +- +-template +-void CudaArrayWrapper::Deallocate() { +- if (array_ != nullptr) { +- CUDA_SAFE_CALL(cudaFreeArray(array_)); +- array_ = nullptr; +- } +-} +- +-} // namespace mvs +-} // namespace colmap +- +-#endif // COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_ +diff --git a/src/mvs/cuda_texture.h b/src/mvs/cuda_texture.h +new file mode 100644 +index 000000000..3dcd8d171 +--- /dev/null ++++ b/src/mvs/cuda_texture.h +@@ -0,0 +1,180 @@ ++// Copyright (c) 2023, ETH Zurich and UNC Chapel Hill. ++// All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions are met: ++// ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. 
++// ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// ++// * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of ++// its contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE ++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++// POSSIBILITY OF SUCH DAMAGE. ++// ++// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de) ++ ++#ifndef COLMAP_SRC_MVS_CUDA_TEXTURE_H_ ++#define COLMAP_SRC_MVS_CUDA_TEXTURE_H_ ++ ++#include ++ ++#include ++ ++#include "mvs/gpu_mat.h" ++#include "util/cudacc.h" ++#include "util/logging.h" ++ ++namespace colmap { ++namespace mvs { ++ ++template ++class CudaArrayLayeredTexture { ++ public: ++ static std::unique_ptr> FromGpuMat( ++ const cudaTextureDesc& texture_desc, const GpuMat& mat); ++ static std::unique_ptr> FromHostArray( ++ const cudaTextureDesc& texture_desc, const size_t width, ++ const size_t height, const size_t depth, const T* data); ++ ++ cudaTextureObject_t GetObj() const; ++ ++ size_t GetWidth() const; ++ size_t GetHeight() const; ++ size_t GetDepth() const; ++ ++ CudaArrayLayeredTexture(const cudaTextureDesc& texture_desc, ++ const size_t width, const size_t height, ++ const size_t depth); ++ ~CudaArrayLayeredTexture(); ++ ++ private: ++ // Define class as non-copyable and non-movable. 
++ CudaArrayLayeredTexture(CudaArrayLayeredTexture const&) = delete; ++ void operator=(CudaArrayLayeredTexture const& obj) = delete; ++ CudaArrayLayeredTexture(CudaArrayLayeredTexture&&) = delete; ++ ++ const size_t width_; ++ const size_t height_; ++ const size_t depth_; ++ ++ cudaArray_t array_; ++ const cudaTextureDesc texture_desc_; ++ cudaResourceDesc resource_desc_; ++ cudaTextureObject_t texture_; ++}; ++ ++//////////////////////////////////////////////////////////////////////////////// ++// Implementation ++//////////////////////////////////////////////////////////////////////////////// ++ ++template ++std::unique_ptr> ++CudaArrayLayeredTexture::FromGpuMat(const cudaTextureDesc& texture_desc, ++ const GpuMat& mat) { ++ auto array = std::make_unique>( ++ texture_desc, mat.GetWidth(), mat.GetHeight(), mat.GetDepth()); ++ ++ cudaMemcpy3DParms params; ++ memset(¶ms, 0, sizeof(params)); ++ params.extent = ++ make_cudaExtent(mat.GetWidth(), mat.GetHeight(), mat.GetDepth()); ++ params.kind = cudaMemcpyDeviceToDevice; ++ params.srcPtr = make_cudaPitchedPtr((void*)mat.GetPtr(), mat.GetPitch(), ++ mat.GetWidth(), mat.GetHeight()); ++ params.dstArray = array->array_; ++ CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); ++ ++ return array; ++} ++ ++template ++std::unique_ptr> ++CudaArrayLayeredTexture::FromHostArray(const cudaTextureDesc& texture_desc, ++ const size_t width, ++ const size_t height, ++ const size_t depth, const T* data) { ++ auto array = std::make_unique>(texture_desc, width, ++ height, depth); ++ ++ cudaMemcpy3DParms params; ++ memset(¶ms, 0, sizeof(params)); ++ params.extent = make_cudaExtent(width, height, depth); ++ params.kind = cudaMemcpyHostToDevice; ++ params.srcPtr = ++ make_cudaPitchedPtr((void*)data, width * sizeof(T), width, height); ++ params.dstArray = array->array_; ++ CUDA_SAFE_CALL(cudaMemcpy3D(¶ms)); ++ ++ return array; ++} ++ ++template ++CudaArrayLayeredTexture::CudaArrayLayeredTexture( ++ const cudaTextureDesc& texture_desc, const size_t width, ++ const size_t height, const size_t depth) ++ : texture_desc_(texture_desc), ++ width_(width), ++ height_(height), ++ depth_(depth) { ++ CHECK_GT(width_, 0); ++ CHECK_GT(height_, 0); ++ CHECK_GT(depth_, 0); ++ ++ cudaExtent extent = make_cudaExtent(width_, height_, depth_); ++ cudaChannelFormatDesc fmt = cudaCreateChannelDesc(); ++ CUDA_SAFE_CALL(cudaMalloc3DArray(&array_, &fmt, extent, cudaArrayLayered)); ++ ++ memset(&resource_desc_, 0, sizeof(resource_desc_)); ++ resource_desc_.resType = cudaResourceTypeArray; ++ resource_desc_.res.array.array = array_; ++ ++ CUDA_SAFE_CALL(cudaCreateTextureObject(&texture_, &resource_desc_, ++ &texture_desc_, nullptr)); ++} ++ ++template ++CudaArrayLayeredTexture::~CudaArrayLayeredTexture() { ++ CUDA_SAFE_CALL(cudaFreeArray(array_)); ++ CUDA_SAFE_CALL(cudaDestroyTextureObject(texture_)); ++} ++ ++template ++cudaTextureObject_t CudaArrayLayeredTexture::GetObj() const { ++ return texture_; ++} ++ ++template ++size_t CudaArrayLayeredTexture::GetWidth() const { ++ return width_; ++} ++ ++template ++size_t CudaArrayLayeredTexture::GetHeight() const { ++ return height_; ++} ++ ++template ++size_t CudaArrayLayeredTexture::GetDepth() const { ++ return depth_; ++} ++ ++} // namespace mvs ++} // namespace colmap ++ ++#endif // COLMAP_SRC_MVS_CUDA_TEXTURE_H_ +diff --git a/src/mvs/gpu_mat_ref_image.cu b/src/mvs/gpu_mat_ref_image.cu +index c40a10bc3..1e3cc9d5e 100644 +--- a/src/mvs/gpu_mat_ref_image.cu ++++ b/src/mvs/gpu_mat_ref_image.cu +@@ -39,9 +39,8 @@ namespace colmap { + namespace mvs { + namespace { 
+ +-texture image_texture; +- +-__global__ void FilterKernel(GpuMat image, GpuMat sum_image, ++__global__ void FilterKernel(const cudaTextureObject_t image_texture, ++ GpuMat image, GpuMat sum_image, + GpuMat squared_sum_image, + const int window_radius, const int window_step, + const float sigma_spatial, +@@ -54,7 +53,7 @@ __global__ void FilterKernel(GpuMat image, GpuMat sum_image, + + BilateralWeightComputer bilateral_weight_computer(sigma_spatial, sigma_color); + +- const float center_color = tex2D(image_texture, col, row); ++ const float center_color = tex2D(image_texture, col, row); + + float color_sum = 0.0f; + float color_squared_sum = 0.0f; +@@ -65,7 +64,7 @@ __global__ void FilterKernel(GpuMat image, GpuMat sum_image, + for (int window_col = -window_radius; window_col <= window_radius; + window_col += window_step) { + const float color = +- tex2D(image_texture, col + window_col, row + window_row); ++ tex2D(image_texture, col + window_col, row + window_row); + const float bilateral_weight = bilateral_weight_computer.Compute( + window_row, window_col, center_color, color); + color_sum += bilateral_weight * color; +@@ -95,24 +94,25 @@ void GpuMatRefImage::Filter(const uint8_t* image_data, + const size_t window_radius, + const size_t window_step, const float sigma_spatial, + const float sigma_color) { +- CudaArrayWrapper image_array(width_, height_, 1); +- image_array.CopyToDevice(image_data); +- image_texture.addressMode[0] = cudaAddressModeBorder; +- image_texture.addressMode[1] = cudaAddressModeBorder; +- image_texture.addressMode[2] = cudaAddressModeBorder; +- image_texture.filterMode = cudaFilterModePoint; +- image_texture.normalized = false; ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeNormalizedFloat; ++ texture_desc.normalizedCoords = false; ++ auto image_texture = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, width_, height_, 1, image_data); + + const dim3 block_size(kBlockDimX, kBlockDimY); + const dim3 grid_size((width_ - 1) / block_size.x + 1, + (height_ - 1) / block_size.y + 1); + +- CUDA_SAFE_CALL(cudaBindTextureToArray(image_texture, image_array.GetPtr())); + FilterKernel<<>>( +- *image, *sum_image, *squared_sum_image, window_radius, window_step, +- sigma_spatial, sigma_color); ++ image_texture->GetObj(), *image, *sum_image, *squared_sum_image, ++ window_radius, window_step, sigma_spatial, sigma_color); + CUDA_SYNC_AND_CHECK(); +- CUDA_SAFE_CALL(cudaUnbindTexture(image_texture)); + } + + } // namespace mvs +diff --git a/src/mvs/gpu_mat_ref_image.h b/src/mvs/gpu_mat_ref_image.h +index 1e04e5f43..4f4be34e6 100644 +--- a/src/mvs/gpu_mat_ref_image.h ++++ b/src/mvs/gpu_mat_ref_image.h +@@ -34,7 +34,7 @@ + + #include + +-#include "mvs/cuda_array_wrapper.h" ++#include "mvs/cuda_texture.h" + #include "mvs/gpu_mat.h" + + namespace colmap { +@@ -64,8 +64,8 @@ class GpuMatRefImage { + const static size_t kBlockDimX = 16; + const static size_t kBlockDimY = 12; + +- size_t width_; +- size_t height_; ++ const size_t width_; ++ const size_t height_; + }; + + struct BilateralWeightComputer { +diff --git a/src/mvs/patch_match_cuda.cu b/src/mvs/patch_match_cuda.cu +index 845fffa94..772f341a1 100644 +--- a/src/mvs/patch_match_cuda.cu ++++ b/src/mvs/patch_match_cuda.cu +@@ -56,14 +56,6 
@@ + namespace colmap { + namespace mvs { + +-texture +- ref_image_texture; +-texture +- src_images_texture; +-texture +- src_depth_maps_texture; +-texture poses_texture; +- + // Calibration of reference image as {fx, cx, fy, cy}. + __constant__ float ref_K[4]; + // Calibration of reference image as {1/fx, -cx/fx, 1/fy, -cy/fy}. +@@ -229,18 +221,17 @@ __device__ inline float PropagateDepth(const float depth1, + // First, compute triangulation angle between reference and source image for 3D + // point. Second, compute incident angle between viewing direction of source + // image and normal direction of 3D point. Both angles are cosine distances. +-__device__ inline void ComputeViewingAngles(const float point[3], +- const float normal[3], +- const int image_idx, +- float* cos_triangulation_angle, +- float* cos_incident_angle) { ++__device__ inline void ComputeViewingAngles( ++ const cudaTextureObject_t poses_texture, const float point[3], ++ const float normal[3], const int image_idx, float* cos_triangulation_angle, ++ float* cos_incident_angle) { + *cos_triangulation_angle = 0.0f; + *cos_incident_angle = 0.0f; + + // Projection center of source image. + float C[3]; + for (int i = 0; i < 3; ++i) { +- C[i] = tex2D(poses_texture, i + 16, image_idx); ++ C[i] = tex2D(poses_texture, i + 16, image_idx); + } + + // Ray from point to camera. +@@ -256,25 +247,25 @@ __device__ inline void ComputeViewingAngles(const float point[3], + *cos_triangulation_angle = DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm; + } + +-__device__ inline void ComposeHomography(const int image_idx, const int row, +- const int col, const float depth, +- const float normal[3], float H[9]) { ++__device__ inline void ComposeHomography( ++ const cudaTextureObject_t poses_texture, const int image_idx, const int row, ++ const int col, const float depth, const float normal[3], float H[9]) { + // Calibration of source image. + float K[4]; + for (int i = 0; i < 4; ++i) { +- K[i] = tex2D(poses_texture, i, image_idx); ++ K[i] = tex2D(poses_texture, i, image_idx); + } + + // Relative rotation between reference and source image. + float R[9]; + for (int i = 0; i < 9; ++i) { +- R[i] = tex2D(poses_texture, i + 4, image_idx); ++ R[i] = tex2D(poses_texture, i + 4, image_idx); + } + + // Relative translation between reference and source image. + float T[3]; + for (int i = 0; i < 3; ++i) { +- T[i] = tex2D(poses_texture, i + 13, image_idx); ++ T[i] = tex2D(poses_texture, i + 13, image_idx); + } + + // Distance to the plane. 
+@@ -332,6 +323,9 @@ struct LocalRefImage { + const static int kNumColumns = kThreadBlockSize * THREADS_PER_BLOCK; + const static int kDataSize = kNumRows * kNumColumns; + ++ __device__ explicit LocalRefImage(const cudaTextureObject_t ref_image_texture) ++ : ref_image_texture_(ref_image_texture) {} ++ + float* data = nullptr; + + __device__ inline void Read(const int row) { +@@ -357,7 +351,7 @@ struct LocalRefImage { + #pragma unroll + for (int block = 0; block < kThreadBlockSize; ++block) { + data[local_row * kNumColumns + local_col] = +- tex2D(ref_image_texture, global_col, global_row); ++ tex2D(ref_image_texture_, global_col, global_row); + local_col += THREADS_PER_BLOCK; + global_col += THREADS_PER_BLOCK; + } +@@ -382,12 +376,15 @@ struct LocalRefImage { + #pragma unroll + for (int block = 0; block < kThreadBlockSize; ++block) { + data[local_row * kNumColumns + local_col] = +- tex2D(ref_image_texture, global_col, global_row); ++ tex2D(ref_image_texture_, global_col, global_row); + local_col += THREADS_PER_BLOCK; + global_col += THREADS_PER_BLOCK; + } + } + } ++ ++ private: ++ const cudaTextureObject_t ref_image_texture_; + }; + + // The return values is 1 - NCC, so the range is [0, 2], the smaller the +@@ -396,9 +393,15 @@ template + struct PhotoConsistencyCostComputer { + const static int kWindowRadius = kWindowSize / 2; + +- __device__ PhotoConsistencyCostComputer(const float sigma_spatial, +- const float sigma_color) +- : bilateral_weight_computer_(sigma_spatial, sigma_color) {} ++ __device__ PhotoConsistencyCostComputer( ++ const cudaTextureObject_t ref_image_texture, ++ const cudaTextureObject_t src_images_texture, ++ const cudaTextureObject_t poses_texture, const float sigma_spatial, ++ const float sigma_color) ++ : local_ref_image(ref_image_texture), ++ src_images_texture_(src_images_texture), ++ poses_texture_(poses_texture), ++ bilateral_weight_computer_(sigma_spatial, sigma_color) {} + + // Maximum photo consistency cost as 1 - min(NCC). 
+ const float kMaxCost = 2.0f; +@@ -429,7 +432,8 @@ struct PhotoConsistencyCostComputer { + + __device__ inline float Compute() const { + float tform[9]; +- ComposeHomography(src_image_idx, row, col, depth, normal, tform); ++ ComposeHomography(poses_texture_, src_image_idx, row, col, depth, normal, ++ tform); + + float tform_step[8]; + for (int i = 0; i < 8; ++i) { +@@ -467,8 +471,8 @@ struct PhotoConsistencyCostComputer { + const float norm_col_src = inv_z * col_src + 0.5f; + const float norm_row_src = inv_z * row_src + 0.5f; + const float ref_color = local_ref_image.data[ref_image_idx]; +- const float src_color = tex2DLayered(src_images_texture, norm_col_src, +- norm_row_src, src_image_idx); ++ const float src_color = tex2DLayered( ++ src_images_texture_, norm_col_src, norm_row_src, src_image_idx); + + const float bilateral_weight = bilateral_weight_computer_.Compute( + row, col, ref_center_color, ref_color); +@@ -528,22 +532,24 @@ struct PhotoConsistencyCostComputer { + } + + private: ++ const cudaTextureObject_t src_images_texture_; ++ const cudaTextureObject_t poses_texture_; + const BilateralWeightComputer bilateral_weight_computer_; + }; + +-__device__ inline float ComputeGeomConsistencyCost(const float row, +- const float col, +- const float depth, +- const int image_idx, +- const float max_cost) { ++__device__ inline float ComputeGeomConsistencyCost( ++ const cudaTextureObject_t poses_texture, ++ const cudaTextureObject_t src_depth_maps_texture, const float row, ++ const float col, const float depth, const int image_idx, ++ const float max_cost) { + // Extract projection matrices for source image. + float P[12]; + for (int i = 0; i < 12; ++i) { +- P[i] = tex2D(poses_texture, i + 19, image_idx); ++ P[i] = tex2D(poses_texture, i + 19, image_idx); + } + float inv_P[12]; + for (int i = 0; i < 12; ++i) { +- inv_P[i] = tex2D(poses_texture, i + 31, image_idx); ++ inv_P[i] = tex2D(poses_texture, i + 31, image_idx); + } + + // Project point in reference image to world. +@@ -562,8 +568,8 @@ __device__ inline float ComputeGeomConsistencyCost(const float row, + P[6] * forward_point[2] + P[7]); + + // Extract depth in source image. +- const float src_depth = tex2DLayered(src_depth_maps_texture, src_col + 0.5f, +- src_row + 0.5f, image_idx); ++ const float src_depth = tex2DLayered( ++ src_depth_maps_texture, src_col + 0.5f, src_row + 0.5f, image_idx); + + // Projection outside of source image. 
+ if (src_depth == 0.0f) { +@@ -794,15 +800,20 @@ template + __global__ void ComputeInitialCost(GpuMat cost_map, + const GpuMat depth_map, + const GpuMat normal_map, ++ const cudaTextureObject_t ref_image_texture, + const GpuMat ref_sum_image, + const GpuMat ref_squared_sum_image, ++ const cudaTextureObject_t src_images_texture, ++ const cudaTextureObject_t poses_texture, + const float sigma_spatial, + const float sigma_color) { + const int col = blockDim.x * blockIdx.x + threadIdx.x; + + typedef PhotoConsistencyCostComputer + PhotoConsistencyCostComputerType; +- PhotoConsistencyCostComputerType pcc_computer(sigma_spatial, sigma_color); ++ PhotoConsistencyCostComputerType pcc_computer( ++ ref_image_texture, src_images_texture, poses_texture, sigma_spatial, ++ sigma_color); + pcc_computer.col = col; + + __shared__ float local_ref_image_data +@@ -859,8 +870,13 @@ __global__ void SweepFromTopToBottom( + GpuMat global_workspace, GpuMat rand_state_map, + GpuMat cost_map, GpuMat depth_map, GpuMat normal_map, + GpuMat consistency_mask, GpuMat sel_prob_map, +- const GpuMat prev_sel_prob_map, const GpuMat ref_sum_image, +- const GpuMat ref_squared_sum_image, const SweepOptions options) { ++ const GpuMat prev_sel_prob_map, ++ const cudaTextureObject_t ref_image_texture, ++ const GpuMat ref_sum_image, ++ const GpuMat ref_squared_sum_image, ++ const cudaTextureObject_t src_images_texture, ++ const cudaTextureObject_t src_depth_maps_texture, ++ const cudaTextureObject_t poses_texture, const SweepOptions options) { + const int col = blockDim.x * blockIdx.x + threadIdx.x; + + // Probability for boundary pixels. +@@ -904,8 +920,9 @@ __global__ void SweepFromTopToBottom( + + typedef PhotoConsistencyCostComputer + PhotoConsistencyCostComputerType; +- PhotoConsistencyCostComputerType pcc_computer(options.sigma_spatial, +- options.sigma_color); ++ PhotoConsistencyCostComputerType pcc_computer( ++ ref_image_texture, src_images_texture, poses_texture, ++ options.sigma_spatial, options.sigma_color); + pcc_computer.col = col; + + __shared__ float local_ref_image_data +@@ -982,16 +999,17 @@ __global__ void SweepFromTopToBottom( + + float cos_triangulation_angle; + float cos_incident_angle; +- ComputeViewingAngles(point, curr_param_state.normal, image_idx, +- &cos_triangulation_angle, &cos_incident_angle); ++ ComputeViewingAngles(poses_texture, point, curr_param_state.normal, ++ image_idx, &cos_triangulation_angle, ++ &cos_incident_angle); + const float tri_prob = + likelihood_computer.ComputeTriProb(cos_triangulation_angle); + const float inc_prob = + likelihood_computer.ComputeIncProb(cos_incident_angle); + + float H[9]; +- ComposeHomography(image_idx, row, col, curr_param_state.depth, +- curr_param_state.normal, H); ++ ComposeHomography(poses_texture, image_idx, row, col, ++ curr_param_state.depth, curr_param_state.normal, H); + const float res_prob = + likelihood_computer.ComputeResolutionProb(H, row, col); + +@@ -1035,10 +1053,11 @@ __global__ void SweepFromTopToBottom( + + costs[0] += cost_map.Get(row, col, pcc_computer.src_image_idx); + if (kGeomConsistencyTerm) { +- costs[0] += options.geom_consistency_regularizer * +- ComputeGeomConsistencyCost( +- row, col, depths[0], pcc_computer.src_image_idx, +- options.geom_consistency_max_cost); ++ costs[0] += ++ options.geom_consistency_regularizer * ++ ComputeGeomConsistencyCost( ++ poses_texture, src_depth_maps_texture, row, col, depths[0], ++ pcc_computer.src_image_idx, options.geom_consistency_max_cost); + } + + for (int i = 1; i < kNumCosts; ++i) { +@@ -1048,7 
+1067,8 @@ __global__ void SweepFromTopToBottom( + if (kGeomConsistencyTerm) { + costs[i] += options.geom_consistency_regularizer * + ComputeGeomConsistencyCost( +- row, col, depths[i], pcc_computer.src_image_idx, ++ poses_texture, src_depth_maps_texture, row, col, ++ depths[i], pcc_computer.src_image_idx, + options.geom_consistency_max_cost); + } + } +@@ -1102,7 +1122,7 @@ __global__ void SweepFromTopToBottom( + for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) { + float cos_triangulation_angle; + float cos_incident_angle; +- ComputeViewingAngles(best_point, best_normal, image_idx, ++ ComputeViewingAngles(poses_texture, best_point, best_normal, image_idx, + &cos_triangulation_angle, &cos_incident_angle); + if (cos_triangulation_angle > cos_min_triangulation_angle || + cos_incident_angle <= 0.0f) { +@@ -1115,7 +1135,8 @@ __global__ void SweepFromTopToBottom( + num_consistent += 1; + } + } else if (!kFilterPhotoConsistency) { +- if (ComputeGeomConsistencyCost(row, col, best_depth, image_idx, ++ if (ComputeGeomConsistencyCost(poses_texture, src_depth_maps_texture, ++ row, col, best_depth, image_idx, + options.geom_consistency_max_cost) <= + options.filter_geom_consistency_max_cost) { + consistency_mask.Set(row, col, image_idx, 1); +@@ -1123,7 +1144,8 @@ __global__ void SweepFromTopToBottom( + } + } else { + if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob && +- ComputeGeomConsistencyCost(row, col, best_depth, image_idx, ++ ComputeGeomConsistencyCost(poses_texture, src_depth_maps_texture, ++ row, col, best_depth, image_idx, + options.geom_consistency_max_cost) <= + options.filter_geom_consistency_max_cost) { + consistency_mask.Set(row, col, image_idx, 1); +@@ -1169,53 +1191,49 @@ PatchMatchCuda::PatchMatchCuda(const PatchMatchOptions& options, + InitWorkspaceMemory(); + } + +-PatchMatchCuda::~PatchMatchCuda() { +- for (size_t i = 0; i < 4; ++i) { +- poses_device_[i].reset(); +- } +-} +- + void PatchMatchCuda::Run() { + #define CASE_WINDOW_RADIUS(window_radius, window_step) \ + case window_radius: \ + RunWithWindowSizeAndStep<2 * window_radius + 1, window_step>(); \ + break; + +-#define CASE_WINDOW_STEP(window_step) \ +- case window_step: \ +- switch (options_.window_radius) { \ +- CASE_WINDOW_RADIUS(1, window_step) \ +- CASE_WINDOW_RADIUS(2, window_step) \ +- CASE_WINDOW_RADIUS(3, window_step) \ +- CASE_WINDOW_RADIUS(4, window_step) \ +- CASE_WINDOW_RADIUS(5, window_step) \ +- CASE_WINDOW_RADIUS(6, window_step) \ +- CASE_WINDOW_RADIUS(7, window_step) \ +- CASE_WINDOW_RADIUS(8, window_step) \ +- CASE_WINDOW_RADIUS(9, window_step) \ +- CASE_WINDOW_RADIUS(10, window_step) \ +- CASE_WINDOW_RADIUS(11, window_step) \ +- CASE_WINDOW_RADIUS(12, window_step) \ +- CASE_WINDOW_RADIUS(13, window_step) \ +- CASE_WINDOW_RADIUS(14, window_step) \ +- CASE_WINDOW_RADIUS(15, window_step) \ +- CASE_WINDOW_RADIUS(16, window_step) \ +- CASE_WINDOW_RADIUS(17, window_step) \ +- CASE_WINDOW_RADIUS(18, window_step) \ +- CASE_WINDOW_RADIUS(19, window_step) \ +- CASE_WINDOW_RADIUS(20, window_step) \ +- default: { \ +- std::cerr << "Error: Window size not supported" << std::endl; \ +- break; \ +- } \ +- } \ ++#define CASE_WINDOW_STEP(window_step) \ ++ case window_step: \ ++ switch (options_.window_radius) { \ ++ CASE_WINDOW_RADIUS(1, window_step) \ ++ CASE_WINDOW_RADIUS(2, window_step) \ ++ CASE_WINDOW_RADIUS(3, window_step) \ ++ CASE_WINDOW_RADIUS(4, window_step) \ ++ CASE_WINDOW_RADIUS(5, window_step) \ ++ CASE_WINDOW_RADIUS(6, window_step) \ ++ CASE_WINDOW_RADIUS(7, window_step) \ ++ 
CASE_WINDOW_RADIUS(8, window_step) \ ++ CASE_WINDOW_RADIUS(9, window_step) \ ++ CASE_WINDOW_RADIUS(10, window_step) \ ++ CASE_WINDOW_RADIUS(11, window_step) \ ++ CASE_WINDOW_RADIUS(12, window_step) \ ++ CASE_WINDOW_RADIUS(13, window_step) \ ++ CASE_WINDOW_RADIUS(14, window_step) \ ++ CASE_WINDOW_RADIUS(15, window_step) \ ++ CASE_WINDOW_RADIUS(16, window_step) \ ++ CASE_WINDOW_RADIUS(17, window_step) \ ++ CASE_WINDOW_RADIUS(18, window_step) \ ++ CASE_WINDOW_RADIUS(19, window_step) \ ++ CASE_WINDOW_RADIUS(20, window_step) \ ++ default: { \ ++ std::cerr << "Error: Window size " << options_.window_radius \ ++ << " not supported" << std::endl; \ ++ break; \ ++ } \ ++ } \ + break; + + switch (options_.window_step) { + CASE_WINDOW_STEP(1) + CASE_WINDOW_STEP(2) + default: { +- std::cerr << "Error: Window step not supported" << std::endl; ++ std::cerr << "Error: Window step " << options_.window_step ++ << " not supported" << std::endl; + break; + } + } +@@ -1274,9 +1292,10 @@ void PatchMatchCuda::RunWithWindowSizeAndStep() { + ComputeCudaConfig(); + ComputeInitialCost + <<>>( +- *cost_map_, *depth_map_, *normal_map_, *ref_image_->sum_image, +- *ref_image_->squared_sum_image, options_.sigma_spatial, +- options_.sigma_color); ++ *cost_map_, *depth_map_, *normal_map_, ref_image_texture_->GetObj(), ++ *ref_image_->sum_image, *ref_image_->squared_sum_image, ++ src_images_texture_->GetObj(), poses_texture_[0]->GetObj(), ++ options_.sigma_spatial, options_.sigma_color); + CUDA_SYNC_AND_CHECK(); + + init_timer.Print("Initialization"); +@@ -1324,8 +1343,13 @@ void PatchMatchCuda::RunWithWindowSizeAndStep() { + <<>>( \ + *global_workspace_, *rand_state_map_, *cost_map_, *depth_map_, \ + *normal_map_, *consistency_mask_, *sel_prob_map_, \ +- *prev_sel_prob_map_, *ref_image_->sum_image, \ +- *ref_image_->squared_sum_image, sweep_options); ++ *prev_sel_prob_map_, ref_image_texture_->GetObj(), \ ++ *ref_image_->sum_image, *ref_image_->squared_sum_image, \ ++ src_images_texture_->GetObj(), \ ++ src_depth_maps_texture_ == nullptr \ ++ ? 0 \ ++ : src_depth_maps_texture_->GetObj(), \ ++ poses_texture_[rotation_in_half_pi_]->GetObj(), sweep_options); + + if (last_sweep) { + if (options_.filter) { +@@ -1410,13 +1434,26 @@ void PatchMatchCuda::ComputeCudaConfig() { + elem_wise_grid_size_.z = 1; + } + ++void PatchMatchCuda::BindRefImageTexture() { ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeNormalizedFloat; ++ texture_desc.normalizedCoords = false; ++ ref_image_texture_ = CudaArrayLayeredTexture::FromGpuMat( ++ texture_desc, *ref_image_->image); ++} ++ + void PatchMatchCuda::InitRefImage() { + const Image& ref_image = problem_.images->at(problem_.ref_image_idx); + + ref_width_ = ref_image.GetWidth(); + ref_height_ = ref_image.GetHeight(); + +- // Upload to device. ++ // Upload to device and filter. 
+ ref_image_.reset(new GpuMatRefImage(ref_width_, ref_height_)); + const std::vector ref_image_array = + ref_image.GetBitmap().ConvertToRowMajorArray(); +@@ -1424,18 +1461,7 @@ void PatchMatchCuda::InitRefImage() { + options_.window_step, options_.sigma_spatial, + options_.sigma_color); + +- ref_image_device_.reset( +- new CudaArrayWrapper(ref_width_, ref_height_, 1)); +- ref_image_device_->CopyFromGpuMat(*ref_image_->image); +- +- // Create texture. +- ref_image_texture.addressMode[0] = cudaAddressModeBorder; +- ref_image_texture.addressMode[1] = cudaAddressModeBorder; +- ref_image_texture.addressMode[2] = cudaAddressModeBorder; +- ref_image_texture.filterMode = cudaFilterModePoint; +- ref_image_texture.normalized = false; +- CUDA_SAFE_CALL( +- cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr())); ++ BindRefImageTexture(); + } + + void PatchMatchCuda::InitSourceImages() { +@@ -1470,19 +1496,18 @@ void PatchMatchCuda::InitSourceImages() { + } + } + +- // Upload to device. +- src_images_device_.reset(new CudaArrayWrapper( +- max_width, max_height, problem_.src_image_idxs.size())); +- src_images_device_->CopyToDevice(src_images_host_data.data()); +- + // Create source images texture. +- src_images_texture.addressMode[0] = cudaAddressModeBorder; +- src_images_texture.addressMode[1] = cudaAddressModeBorder; +- src_images_texture.addressMode[2] = cudaAddressModeBorder; +- src_images_texture.filterMode = cudaFilterModeLinear; +- src_images_texture.normalized = false; +- CUDA_SAFE_CALL(cudaBindTextureToArray(src_images_texture, +- src_images_device_->GetPtr())); ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModeLinear; ++ texture_desc.readMode = cudaReadModeNormalizedFloat; ++ texture_desc.normalizedCoords = false; ++ src_images_texture_ = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, max_width, max_height, problem_.src_image_idxs.size(), ++ src_images_host_data.data()); + } + + // Upload source depth maps to device. +@@ -1504,19 +1529,18 @@ void PatchMatchCuda::InitSourceImages() { + } + } + +- src_depth_maps_device_.reset(new CudaArrayWrapper( +- max_width, max_height, problem_.src_image_idxs.size())); +- src_depth_maps_device_->CopyToDevice(src_depth_maps_host_data.data()); +- + // Create source depth maps texture. +- src_depth_maps_texture.addressMode[0] = cudaAddressModeBorder; +- src_depth_maps_texture.addressMode[1] = cudaAddressModeBorder; +- src_depth_maps_texture.addressMode[2] = cudaAddressModeBorder; +- // TODO: Check if linear interpolation improves results or not. 
+- src_depth_maps_texture.filterMode = cudaFilterModePoint; +- src_depth_maps_texture.normalized = false; +- CUDA_SAFE_CALL(cudaBindTextureToArray(src_depth_maps_texture, +- src_depth_maps_device_->GetPtr())); ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeElementType; ++ texture_desc.normalizedCoords = false; ++ src_depth_maps_texture_ = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, max_width, max_height, problem_.src_image_idxs.size(), ++ src_depth_maps_host_data.data()); + } + } + +@@ -1576,6 +1600,15 @@ void PatchMatchCuda::InitTransforms() { + // Matrix for 90deg rotation around Z-axis in counter-clockwise direction. + const float R_z90[9] = {0, 1, 0, -1, 0, 0, 0, 0, 1}; + ++ cudaTextureDesc texture_desc; ++ memset(&texture_desc, 0, sizeof(texture_desc)); ++ texture_desc.addressMode[0] = cudaAddressModeBorder; ++ texture_desc.addressMode[1] = cudaAddressModeBorder; ++ texture_desc.addressMode[2] = cudaAddressModeBorder; ++ texture_desc.filterMode = cudaFilterModePoint; ++ texture_desc.readMode = cudaReadModeElementType; ++ texture_desc.normalizedCoords = false; ++ + for (size_t i = 0; i < 4; ++i) { + const size_t kNumTformParams = 4 + 9 + 3 + 3 + 12 + 12; + std::vector poses_host_data(kNumTformParams * +@@ -1614,20 +1647,12 @@ void PatchMatchCuda::InitTransforms() { + offset += 12; + } + +- poses_device_[i].reset(new CudaArrayWrapper( +- kNumTformParams, problem_.src_image_idxs.size(), 1)); +- poses_device_[i]->CopyToDevice(poses_host_data.data()); ++ poses_texture_[i] = CudaArrayLayeredTexture::FromHostArray( ++ texture_desc, kNumTformParams, problem_.src_image_idxs.size(), 1, ++ poses_host_data.data()); + + RotatePose(R_z90, rotated_R, rotated_T); + } +- +- poses_texture.addressMode[0] = cudaAddressModeBorder; +- poses_texture.addressMode[1] = cudaAddressModeBorder; +- poses_texture.addressMode[2] = cudaAddressModeBorder; +- poses_texture.filterMode = cudaFilterModePoint; +- poses_texture.normalized = false; +- CUDA_SAFE_CALL( +- cudaBindTextureToArray(poses_texture, poses_device_[0]->GetPtr())); + } + + void PatchMatchCuda::InitWorkspaceMemory() { +@@ -1727,15 +1752,9 @@ void PatchMatchCuda::Rotate() { + ref_image_->squared_sum_image->Rotate( + rotated_ref_image->squared_sum_image.get()); + ref_image_.swap(rotated_ref_image); ++ BindRefImageTexture(); + } + +- // Bind rotated reference image to texture. +- ref_image_device_.reset(new CudaArrayWrapper(width, height, 1)); +- ref_image_device_->CopyFromGpuMat(*ref_image_->image); +- CUDA_SAFE_CALL(cudaUnbindTexture(ref_image_texture)); +- CUDA_SAFE_CALL( +- cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr())); +- + // Rotate selection probability map. + prev_sel_prob_map_.reset( + new GpuMat(width, height, problem_.src_image_idxs.size())); +@@ -1751,11 +1770,6 @@ void PatchMatchCuda::Rotate() { + cost_map_.swap(rotated_cost_map); + } + +- // Rotate transformations. +- CUDA_SAFE_CALL(cudaUnbindTexture(poses_texture)); +- CUDA_SAFE_CALL(cudaBindTextureToArray( +- poses_texture, poses_device_[rotation_in_half_pi_]->GetPtr())); +- + // Rotate calibration. 
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[rotation_in_half_pi_], + sizeof(float) * 4, 0, +diff --git a/src/mvs/patch_match_cuda.h b/src/mvs/patch_match_cuda.h +index adbecdbd9..9e85e9b32 100644 +--- a/src/mvs/patch_match_cuda.h ++++ b/src/mvs/patch_match_cuda.h +@@ -38,7 +38,7 @@ + + #include + +-#include "mvs/cuda_array_wrapper.h" ++#include "mvs/cuda_texture.h" + #include "mvs/depth_map.h" + #include "mvs/gpu_mat.h" + #include "mvs/gpu_mat_prng.h" +@@ -54,7 +54,6 @@ class PatchMatchCuda { + public: + PatchMatchCuda(const PatchMatchOptions& options, + const PatchMatch::Problem& problem); +- ~PatchMatchCuda(); + + void Run(); + +@@ -69,6 +68,8 @@ class PatchMatchCuda { + + void ComputeCudaConfig(); + ++ void BindRefImageTexture(); ++ + void InitRefImage(); + void InitSourceImages(); + void InitTransforms(); +@@ -96,9 +97,9 @@ class PatchMatchCuda { + int rotation_in_half_pi_; + + // Reference and source image input data. +- std::unique_ptr> ref_image_device_; +- std::unique_ptr> src_images_device_; +- std::unique_ptr> src_depth_maps_device_; ++ std::unique_ptr> ref_image_texture_; ++ std::unique_ptr> src_images_texture_; ++ std::unique_ptr> src_depth_maps_texture_; + + // Relative poses from rotated versions of reference image to source images + // corresponding to _rotationInHalfPi: +@@ -114,7 +115,7 @@ class PatchMatchCuda { + // R, T, C, P, P^-1 denote the relative rotation, translation, camera + // center, projection, and inverse projection from there reference to the + // i-th source image. +- std::unique_ptr> poses_device_[4]; ++ std::unique_ptr> poses_texture_[4]; + + // Calibration matrix for rotated versions of reference image + // as {K[0, 0], K[0, 2], K[1, 1], K[1, 2]} corresponding to _rotationInHalfPi. diff --git a/recipe/1823.patch b/recipe/1823.patch new file mode 100644 index 0000000..2c31a89 --- /dev/null +++ b/recipe/1823.patch @@ -0,0 +1,39 @@ +From 821a85ba0d96ce5f53a15d8a538de268b891b3d0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Wed, 1 Mar 2023 22:10:07 +0100 +Subject: [PATCH] Remove unused SIFT GPU cuda texture reference + +--- + lib/SiftGPU/ProgramCU.cu | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu +index 51c781341..9d842770d 100644 +--- a/lib/SiftGPU/ProgramCU.cu ++++ b/lib/SiftGPU/ProgramCU.cu +@@ -1683,9 +1683,6 @@ void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2, + MatH, hdistmax, MatF, fdistmax); + } + +- +-texture texDOT; +- + #define ROWMATCH_BLOCK_WIDTH 32 + #define ROWMATCH_BLOCK_HEIGHT 1 + +@@ -1755,15 +1752,12 @@ void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float dist + int num2 = texDot->GetImgWidth(); + dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT); + dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT); +- // texDot->BindTexture(texDOT); + RowMatch_Kernel<<>>((int*)texDot->_cuData, + (int*)texMatch->_cuData, num2, distmax, ratiomax); + } + + #define COLMATCH_BLOCK_WIDTH 32 + +-//texture texCT; +- + void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax) + { + int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x; diff --git a/recipe/1838.patch b/recipe/1838.patch new file mode 100644 index 0000000..84ae322 --- /dev/null +++ b/recipe/1838.patch @@ -0,0 +1,1148 @@ +From d361730f19a675e1f60b3fe45333441deb0e3d1d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Sat, 11 Mar 2023 22:28:18 +0100 +Subject: [PATCH 
01/14] [WIP] Upgrade SiftGPU to use CUDA texture objects + +--- + lib/SiftGPU/CuTexImage.cpp | 54 +++++++++++++++++++- + lib/SiftGPU/CuTexImage.h | 12 ++++- + lib/SiftGPU/ProgramCU.cu | 101 ++++++++++++++++++++----------------- + 3 files changed, 118 insertions(+), 49 deletions(-) + +diff --git a/lib/SiftGPU/CuTexImage.cpp b/lib/SiftGPU/CuTexImage.cpp +index a4ef59bba..be0383d2b 100644 +--- a/lib/SiftGPU/CuTexImage.cpp ++++ b/lib/SiftGPU/CuTexImage.cpp +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + using namespace std; + + +@@ -39,10 +40,48 @@ using namespace std; + #include "CuTexImage.h" + #include "ProgramCU.h" + +-#if CUDA_VERSION <= 2010 && defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +-#error "Require CUDA 2.2 or higher" +-#endif ++CuTexImage::CuTexObj::~CuTexObj() ++{ ++ cudaDestroyTextureObject(handle); ++} ++ ++CuTexImage::CuTexObj CuTexImage::BindTexture(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc) ++{ ++ CuTexObj texObj; + ++ cudaResourceDesc resourceDesc; ++ memset(&resourceDesc, 0, sizeof(resourceDesc)); ++ resourceDesc.resType = cudaResourceTypeLinear; ++ resourceDesc.res.linear.devPtr = _cuData; ++ resourceDesc.res.linear.desc = channelFmtDesc; ++ resourceDesc.res.linear.sizeInBytes = _numBytes; ++ ++ cudaCreateTextureObject(&texObj.handle, &resourceDesc, &textureDesc, nullptr); ++ ProgramCU::CheckErrorCUDA("CuTexImage::BindTexture"); ++ ++ return texObj; ++} ++ ++CuTexImage::CuTexObj CuTexImage::BindTexture2D(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc) ++{ ++ CuTexObj texObj; ++ ++ cudaResourceDesc resourceDesc; ++ memset(&resourceDesc, 0, sizeof(resourceDesc)); ++ resourceDesc.resType = cudaResourceTypePitch2D; ++ resourceDesc.res.pitch2D.devPtr = _cuData; ++ resourceDesc.res.pitch2D.width = _imgWidth; ++ resourceDesc.res.pitch2D.height = _imgHeight; ++ resourceDesc.res.pitch2D.pitchInBytes = _imgWidth * _numChannel * sizeof(float); ++ resourceDesc.res.pitch2D.desc = channelFmtDesc; ++ ++ cudaCreateTextureObject(&texObj.handle, &resourceDesc, &textureDesc, nullptr); ++ ProgramCU::CheckErrorCUDA("CuTexImage::BindTexture2D"); ++ ++ return texObj; ++} + + CuTexImage::CuTexImage() + { +@@ -171,69 +210,6 @@ void CuTexImage::CopyToHost(void * buf, int stream) + cudaMemcpyAsync(buf, _cuData, _imgWidth * _imgHeight * _numChannel * sizeof(float), cudaMemcpyDeviceToHost, (cudaStream_t)stream); + } + +-void CuTexImage::InitTexture2D() +-{ +-#if !defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +- if(_cuData2D && (_texWidth < _imgWidth || _texHeight < _imgHeight)) +- { +- cudaFreeArray(_cuData2D); +- _cuData2D = NULL; +- } +- +- if(_cuData2D == NULL) +- { +- _texWidth = max(_texWidth, _imgWidth); +- _texHeight = max(_texHeight, _imgHeight); +- cudaChannelFormatDesc desc; +- desc.f = cudaChannelFormatKindFloat; +- desc.x = sizeof(float) * 8; +- desc.y = _numChannel >=2 ? sizeof(float) * 8 : 0; +- desc.z = _numChannel >=3 ? sizeof(float) * 8 : 0; +- desc.w = _numChannel >=4 ? 
sizeof(float) * 8 : 0; +- const cudaError_t status = cudaMallocArray(&_cuData2D, &desc, _texWidth, _texHeight); +- +- if (status != cudaSuccess) { +- _cuData = NULL; +- _numBytes = 0; +- } +- +- ProgramCU::CheckErrorCUDA("CuTexImage::InitTexture2D"); +- } +-#endif +-} +- +-void CuTexImage::CopyToTexture2D() +-{ +-#if !defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +- InitTexture2D(); +- +- if(_cuData2D) +- { +- cudaMemcpy2DToArray(_cuData2D, 0, 0, _cuData, _imgWidth* _numChannel* sizeof(float) , +- _imgWidth * _numChannel*sizeof(float), _imgHeight, cudaMemcpyDeviceToDevice); +- ProgramCU::CheckErrorCUDA("cudaMemcpy2DToArray"); +- } +-#endif +- +-} +- +-int CuTexImage::DebugCopyToTexture2D() +-{ +- +-/* CuTexImage tex; +- float data1[2][3] = {{1, 2, 5}, {3, 4, 5}}, data2[2][5]; +- tex.InitTexture(3, 2, 1); +- cudaMemcpy(tex._cuData, data1[0], 6 * sizeof(float), cudaMemcpyHostToDevice); +- cudaMemcpy(data1, tex._cuData, 4 * sizeof(float) , cudaMemcpyDeviceToHost); +- tex._texWidth =5; tex._texHeight = 2; +- tex.CopyToTexture2D(); +- cudaMemcpyFromArray(data2[0], tex._cuData2D, 0, 0, 10 * sizeof(float), cudaMemcpyDeviceToHost);*/ +- +- return 1; +-} +- +- +- + void CuTexImage::CopyFromPBO(int width, int height, GLuint pbo) + { + void* pbuf =NULL; +diff --git a/lib/SiftGPU/CuTexImage.h b/lib/SiftGPU/CuTexImage.h +index 0d62f6d07..1303b24cc 100644 +--- a/lib/SiftGPU/CuTexImage.h ++++ b/lib/SiftGPU/CuTexImage.h +@@ -24,13 +24,9 @@ + #ifndef CU_TEX_IMAGE_H + #define CU_TEX_IMAGE_H + +-class GLTexImage; +-struct cudaArray; +-struct textureReference; +- +-//using texture2D from linear memory ++#include + +-#define SIFTGPU_ENABLE_LINEAR_TEX2D ++class GLTexImage; + + class CuTexImage + { +@@ -45,18 +41,23 @@ class CuTexImage + int _texHeight; + GLuint _fromPBO; + public: ++ struct CuTexObj ++ { ++ cudaTextureObject_t handle; ++ ~CuTexObj(); ++ }; ++ + virtual void SetImageSize(int width, int height); + virtual bool InitTexture(int width, int height, int nchannel = 1); +- void InitTexture2D(); +- inline void BindTexture(textureReference& texRef); +- inline void BindTexture2D(textureReference& texRef); +- void CopyToTexture2D(); ++ CuTexObj BindTexture(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc); ++ CuTexObj BindTexture2D(const cudaTextureDesc& textureDesc, ++ const cudaChannelFormatDesc& channelFmtDesc); + void CopyToHost(void* buf); + void CopyToHost(void* buf, int stream); + void CopyFromHost(const void* buf); + int CopyToPBO(GLuint pbo); + void CopyFromPBO(int width, int height, GLuint pbo); +- static int DebugCopyToTexture2D(); + public: + inline int GetImgWidth(){return _imgWidth;} + inline int GetImgHeight(){return _imgHeight;} +diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu +index 9d842770d..0b99ad066 100644 +--- a/lib/SiftGPU/ProgramCU.cu ++++ b/lib/SiftGPU/ProgramCU.cu +@@ -98,19 +98,33 @@ + + + __device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH]; +-texture texData; +-texture texDataB; +-texture texDataF2; +-texture texDataF4; +-texture texDataI4; +-texture texDataList; +- +-//template __device__ float Conv(float *data) { return Conv(data) + data[i]*d_kernel[i];} +-//template<> __device__ float Conv<0>(float *data) { return data[0] * d_kernel[0]; } + ++const static cudaTextureDesc texDataDesc = []() { ++ cudaTextureDesc textureDesc; ++ memset(&textureDesc, 0, sizeof(textureDesc)); ++ textureDesc.readMode = cudaReadModeElementType; ++ textureDesc.addressMode[0] = cudaAddressModeClamp; ++ textureDesc.addressMode[1] = cudaAddressModeClamp; ++ 
textureDesc.addressMode[2] = cudaAddressModeClamp; ++ textureDesc.filterMode = cudaFilterModePoint; ++ textureDesc.normalizedCoords = false; ++ return textureDesc; ++}(); ++ ++const static cudaTextureDesc texDataBDesc = []() { ++ cudaTextureDesc textureDesc; ++ memset(&textureDesc, 0, sizeof(textureDesc)); ++ textureDesc.readMode = cudaReadModeNormalizedFloat; ++ textureDesc.addressMode[0] = cudaAddressModeClamp; ++ textureDesc.addressMode[1] = cudaAddressModeClamp; ++ textureDesc.addressMode[2] = cudaAddressModeClamp; ++ textureDesc.filterMode = cudaFilterModePoint; ++ textureDesc.normalizedCoords = false; ++ return textureDesc; ++}(); + + ////////////////////////////////////////////////////////////// +-template __global__ void FilterH( float* d_result, int width) ++template __global__ void FilterH(cudaTextureObject_t texData, float* d_result, int width) + { + + const int HALF_WIDTH = FW >> 1; +@@ -130,7 +144,7 @@ template __global__ void FilterH( float* d_result, int width) + if(cache_index < CACHE_WIDTH) + { + int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index); +- data[cache_index] = tex1Dfetch(texData,fetch_index); ++ data[cache_index] = tex1Dfetch(texData,fetch_index); + src_index += FILTERH_TILE_WIDTH; + cache_index += FILTERH_TILE_WIDTH; + } +@@ -149,7 +163,7 @@ template __global__ void FilterH( float* d_result, int width) + + + //////////////////////////////////////////////////////////////////// +-template __global__ void FilterV(float* d_result, int width, int height) ++template __global__ void FilterV(cudaTextureObject_t texData, float* d_result, int width, int height) + { + const int HALF_WIDTH = FW >> 1; + const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1; +@@ -188,7 +202,7 @@ template __global__ void FilterV(float* d_result, int width, int heigh + if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT) + { + int fetch_index = data_index < col ? col : (data_index > data_index_max? 
data_index_max : data_index); +- data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index); ++ data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index); + data_index += IMUL(FILTERV_BLOCK_HEIGHT, width); + } + } +@@ -218,7 +232,7 @@ template __global__ void FilterV(float* d_result, int width, int heigh + } + + +-template __global__ void UpsampleKernel(float* d_result, int width) ++template __global__ void UpsampleKernel(cudaTextureObject_t texData, float* d_result, int width) + { + const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1); + const float INV_SCALE = 1.0f / (float(SCALE)); +@@ -232,11 +246,11 @@ template __global__ void UpsampleKernel(float* d_result, int widt + int helper = blockIdx.y & SCALE_MASK; + if (helper) + { +- float v11 = tex1Dfetch(texData, index); +- float v12 = tex1Dfetch(texData, index + 1); ++ float v11 = tex1Dfetch(texData, index); ++ float v12 = tex1Dfetch(texData, index + 1); + index += width; +- float v21 = tex1Dfetch(texData, index); +- float v22 = tex1Dfetch(texData, index + 1); ++ float v21 = tex1Dfetch(texData, index); ++ float v22 = tex1Dfetch(texData, index + 1); + float w1 = INV_SCALE * helper, w2 = 1.0 - w1; + float v1 = (v21 * w1 + w2 * v11); + float v2 = (v22 * w1 + w2 * v12); +@@ -250,8 +264,8 @@ template __global__ void UpsampleKernel(float* d_result, int widt + } + }else + { +- float v1 = tex1Dfetch(texData, index); +- float v2 = tex1Dfetch(texData, index + 1); ++ float v1 = tex1Dfetch(texData, index); ++ float v2 = tex1Dfetch(texData, index + 1); + d_result[dst_idx] = v1; + #pragma unroll + for(int i = 1; i < SCALE; ++i) +@@ -268,19 +282,19 @@ template __global__ void UpsampleKernel(float* d_result, int widt + void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale) + { + int width = src->GetImgWidth(), height = src->GetImgHeight(); +- src->BindTexture(texData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale); + dim3 block(FILTERH_TILE_WIDTH); + switch(log_scale) + { +- case 1 : UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width); break; +- case 2 : UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width); break; +- case 3 : UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width); break; ++ case 1 : UpsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break; ++ case 2 : UpsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break; ++ case 3 : UpsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break; + default: break; + } + } + +-template __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width) ++template __global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width) + { + const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; + if(dst_col >= dst_width) return; +@@ -289,11 +303,11 @@ template __global__ void DownsampleKernel(float* d_result, int sr + const int src_row = blockIdx.y << LOG_SCALE; + const int src_idx = IMUL(src_row, src_width) + src_col; + const int dst_idx = IMUL(dst_width, dst_row) + dst_col; +- d_result[dst_idx] = tex1Dfetch(texData, src_idx); ++ d_result[dst_idx] = tex1Dfetch(texData, src_idx); + + } + +-__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale) ++__global__ void 
DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width, const int log_scale) + { + const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; + if(dst_col >= dst_width) return; +@@ -302,7 +316,7 @@ __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, + const int src_row = blockIdx.y << log_scale; + const int src_idx = IMUL(src_row, src_width) + src_col; + const int dst_idx = IMUL(dst_width, dst_row) + dst_col; +- d_result[dst_idx] = tex1Dfetch(texData, src_idx); ++ d_result[dst_idx] = tex1Dfetch(texData, src_idx); + + } + +@@ -310,28 +324,28 @@ void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale) + { + int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ; + +- src->BindTexture(texData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight()); + dim3 block(FILTERH_TILE_WIDTH); + switch(log_scale) + { +- case 1 : DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; +- case 2 : DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; +- case 3 : DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; +- default: DownsampleKernel <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale); ++ case 1 : DownsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break; ++ case 2 : DownsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break; ++ case 3 : DownsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break; ++ default: DownsampleKernel <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width, log_scale); + } + } + +-__global__ void ChannelReduce_Kernel(float* d_result) ++__global__ void ChannelReduce_Kernel(cudaTextureObject_t texData, float* d_result) + { + int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; +- d_result[index] = tex1Dfetch(texData, index*4); ++ d_result[index] = tex1Dfetch(texData, index*4); + } + +-__global__ void ChannelReduce_Convert_Kernel(float* d_result) ++__global__ void ChannelReduce_Convert_Kernel(cudaTextureObject_t texDataF4, float* d_result) + { + int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; +- float4 rgba = tex1Dfetch(texDataF4, index); ++ float4 rgba = tex1Dfetch(texDataF4, index); + d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z; + } + +@@ -343,19 +357,19 @@ void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int conv + dim3 block(FILTERH_TILE_WIDTH); + if(convert_rgb) + { +- src->BindTexture(texDataF4); +- ChannelReduce_Convert_Kernel<<>>((float*)dst->_cuData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ ChannelReduce_Convert_Kernel<<>>(srcTex.handle, (float*)dst->_cuData); + }else + { +- src->BindTexture(texData); +- ChannelReduce_Kernel<<>>((float*)dst->_cuData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ ChannelReduce_Kernel<<>>(srcTex.handle, (float*)dst->_cuData); + } + } + +-__global__ void ConvertByteToFloat_Kernel(float* d_result) ++__global__ void ConvertByteToFloat_Kernel(cudaTextureObject_t texDataB, float* d_result) + { + int index = IMUL(blockIdx.x, 
FILTERH_TILE_WIDTH) + threadIdx.x; +- d_result[index] = tex1Dfetch(texDataB, index); ++ d_result[index] = tex1Dfetch(texDataB, index); + } + + void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst) +@@ -363,8 +377,8 @@ void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst) + int width = src->GetImgWidth(), height = dst->GetImgHeight() ; + dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH); + dim3 block(FILTERH_TILE_WIDTH); +- src->BindTexture(texDataB); +- ConvertByteToFloat_Kernel<<>>((float*)dst->_cuData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataBDesc, cudaCreateChannelDesc()); ++ ConvertByteToFloat_Kernel<<>>(srcTex.handle, (float*)dst->_cuData); + } + + void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width) +@@ -403,17 +417,17 @@ template void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, C + int width = src->GetImgWidth(), height = src->GetImgHeight(); + + //horizontal filtering +- src->BindTexture(texData); ++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height); + dim3 blockh(FILTERH_TILE_WIDTH); +- FilterH<<>>((float*)buf->_cuData, width); ++ FilterH<<>>(srcTex.handle, (float*)buf->_cuData, width); + CheckErrorCUDA("FilterH"); + + ///vertical filtering +- buf->BindTexture(texData); ++ CuTexImage::CuTexObj bufTex = buf->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT); + dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT); +- FilterV<<>>((float*)dst->_cuData, width, height); ++ FilterV<<>>(bufTex.handle, (float*)dst->_cuData, width, height); + CheckErrorCUDA("FilterV"); + } + +@@ -450,24 +464,20 @@ void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, f + } + + +-texture texC; +-texture texP; +-texture texN; +- +-void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height) ++void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, float2* d_got, int width, int height) + { + int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y; + int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x; + if(col < width && row < height) + { + int index = IMUL(row, width) + col; +- float vp = tex1Dfetch(texP, index); +- float v = tex1Dfetch(texC, index); ++ float vp = tex1Dfetch(texP, index); ++ float v = tex1Dfetch(texC, index); + d_dog[index] = v - vp; +- float vxn = tex1Dfetch(texC, index + 1); +- float vxp = tex1Dfetch(texC, index - 1); +- float vyp = tex1Dfetch(texC, index - width); +- float vyn = tex1Dfetch(texC, index + width); ++ float vxn = tex1Dfetch(texC, index + 1); ++ float vxp = tex1Dfetch(texC, index - 1); ++ float vyp = tex1Dfetch(texC, index - width); ++ float vyn = tex1Dfetch(texC, index + width); + float dx = vxn - vxp, dy = vyn - vyp; + float grd = 0.5f * sqrt(dx * dx + dy * dy); + float rot = (grd == 0.0f? 
0.0f : atan2(dy, dx)); +@@ -475,15 +485,15 @@ void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int he + } + } + +-void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height) ++void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, int width, int height) + { + int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y; + int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x; + if(col < width && row < height) + { + int index = IMUL(row, width) + col; +- float vp = tex1Dfetch(texP, index); +- float v = tex1Dfetch(texC, index); ++ float vp = tex1Dfetch(texP, index); ++ float v = tex1Dfetch(texC, index); + d_dog[index] = v - vp; + } + } +@@ -493,19 +503,19 @@ void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got) + int width = gus->GetImgWidth(), height = gus->GetImgHeight(); + dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY); + dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY); +- gus->BindTexture(texC); +- (gus -1)->BindTexture(texP); ++ CuTexImage::CuTexObj texCObj = gus->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj texPObj = (gus-1)->BindTexture(texDataDesc, cudaCreateChannelDesc()); + if(got->_cuData) +- ComputeDOG_Kernel<<>>((float*) dog->_cuData, (float2*) got->_cuData, width, height); ++ ComputeDOG_Kernel<<>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, (float2*) got->_cuData, width, height); + else +- ComputeDOG_Kernel<<>>((float*) dog->_cuData, width, height); ++ ComputeDOG_Kernel<<>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, width, height); + } + + + #define READ_CMP_DOG_DATA(datai, tex, idx) \ +- datai[0] = tex1Dfetch(tex, idx - 1);\ +- datai[1] = tex1Dfetch(tex, idx);\ +- datai[2] = tex1Dfetch(tex, idx + 1);\ ++ datai[0] = tex1Dfetch(tex, idx - 1);\ ++ datai[1] = tex1Dfetch(tex, idx);\ ++ datai[2] = tex1Dfetch(tex, idx + 1);\ + if(v > nmax)\ + {\ + nmax = max(nmax, datai[0]);\ +@@ -521,7 +531,7 @@ void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got) + } + + +-void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax, ++void __global__ ComputeKEY_Kernel(cudaTextureObject_t texP, cudaTextureObject_t texC, cudaTextureObject_t texN, float4* d_key, int width, int colmax, int rowmax, + float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization) + { + float data[3][3], v; +@@ -546,11 +556,11 @@ void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowm + #endif + { + in_image = 1; +- data[1][1] = v = tex1Dfetch(texC, idx[1]); ++ data[1][1] = v = tex1Dfetch(texC, idx[1]); + if(fabs(v) <= dog_threshold0) goto key_finish; + +- data[1][0] = tex1Dfetch(texC, idx[1] - 1); +- data[1][2] = tex1Dfetch(texC, idx[1] + 1); ++ data[1][0] = tex1Dfetch(texC, idx[1] - 1); ++ data[1][2] = tex1Dfetch(texC, idx[1] + 1); + nmax = max(data[1][0], data[1][2]); + nmin = min(data[1][0], data[1][2]); + +@@ -651,18 +661,18 @@ void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float T + dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY); + #endif + dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY); +- dogp->BindTexture(texP); +- dog ->BindTexture(texC); +- dogn->BindTexture(texN); ++ CuTexImage::CuTexObj texPObj = dogp->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj texCObj = dog->BindTexture(texDataDesc, 
cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj texNObj = dogn->BindTexture(texDataDesc, cudaCreateChannelDesc()); + Tedge = (Tedge+1)*(Tedge+1)/Tedge; +- ComputeKEY_Kernel<<>>((float4*) key->_cuData, width, ++ ComputeKEY_Kernel<<>>(texPObj.handle, texCObj.handle, texNObj.handle, (float4*) key->_cuData, width, + width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization); + + } + + + +-void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height) ++void __global__ InitHist_Kernel(cudaTextureObject_t texDataF4, int4* hist, int ws, int wd, int height) + { + int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y; + int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +@@ -677,7 +687,7 @@ void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height) + #pragma unroll + for(int i = 0; i < 4 ; ++i, ++scol) + { +- float4 temp = tex1Dfetch(texDataF4, sidx +i); ++ float4 temp = tex1Dfetch(texDataF4, sidx +i); + v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0; + } + } +@@ -694,13 +704,13 @@ void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist) + int wd = hist->GetImgWidth(), hd = hist->GetImgHeight(); + dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd); + dim3 block(HIST_INIT_WIDTH, 1); +- key->BindTexture(texDataF4); +- InitHist_Kernel<<>>((int4*) hist->_cuData, ws, wd, hd); ++ CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ InitHist_Kernel<<>>(keyTex.handle, (int4*) hist->_cuData, ws, wd, hd); + } + + + +-void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height) ++void __global__ ReduceHist_Kernel(cudaTextureObject_t texDataI4, int4* d_hist, int ws, int wd, int height) + { + int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y; + int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +@@ -713,7 +723,7 @@ void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height) + #pragma unroll + for(int i = 0; i < 4 && scol < ws; ++i, ++scol) + { +- int4 temp = tex1Dfetch(texDataI4, sidx + i); ++ int4 temp = tex1Dfetch(texDataI4, sidx + i); + v[i] = temp.x + temp.y + temp.z + temp.w; + } + d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]); +@@ -726,21 +736,21 @@ void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2) + int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight(); + int temp = (int)floorf(logf(float(wd * 2/ 3)) / logf(2.0f)); + const int wi = min(7, max(temp , 0)); +- hist1->BindTexture(texDataI4); ++ CuTexImage::CuTexObj hist1Tex = hist1->BindTexture(texDataDesc, cudaCreateChannelDesc()); + + const int BW = 1 << wi, BH = 1 << (7 - wi); + dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH); + dim3 block(BW, BH); +- ReduceHist_Kernel<<>>((int4*)hist2->_cuData, ws, wd, hd); ++ ReduceHist_Kernel<<>>(hist1Tex.handle, (int4*)hist2->_cuData, ws, wd, hd); + } + + +-void __global__ ListGen_Kernel(int4* d_list, int list_len, int width) ++void __global__ ListGen_Kernel(cudaTextureObject_t texDataList, cudaTextureObject_t texDataI4, int4* d_list, int list_len, int width) + { + int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- int4 pos = tex1Dfetch(texDataList, idx1); ++ int4 pos = tex1Dfetch(texDataList, idx1); + int idx2 = IMUL(pos.y, width) + pos.x; +- int4 temp = tex1Dfetch(texDataI4, idx2); ++ int4 temp = tex1Dfetch(texDataI4, idx2); + int sum1 = temp.x + temp.y; + int sum2 = sum1 + temp.z; + pos.x <<= 2; +@@ -766,15 +776,18 @@ void __global__ ListGen_Kernel(int4* d_list, int list_len, int width) + void ProgramCU::GenerateList(CuTexImage* list, 
CuTexImage* hist) + { + int len = list->GetImgWidth(); +- list->BindTexture(texDataList); +- hist->BindTexture(texDataI4); ++ CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj histTex = hist->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM); + dim3 block(LISTGEN_BLOCK_DIM); +- ListGen_Kernel<<>>((int4*) list->_cuData, len, ++ ListGen_Kernel<<>>(listTex.handle, histTex.handle, (int4*) list->_cuData, len, + hist->GetImgWidth()); + } + +-void __global__ ComputeOrientation_Kernel(float4* d_list, ++void __global__ ComputeOrientation_Kernel(cudaTextureObject_t texDataF2, ++ cudaTextureObject_t texDataF4, ++ cudaTextureObject_t texDataList, ++ float4* d_list, + int list_len, + int width, int height, + float sigma, float sigma_step, +@@ -791,16 +804,16 @@ void __global__ ComputeOrientation_Kernel(float4* d_list, + float4 key; + if(existing_keypoint) + { +- key = tex1Dfetch(texDataF4, idx); ++ key = tex1Dfetch(texDataF4, idx); + }else + { +- int4 ikey = tex1Dfetch(texDataList, idx); ++ int4 ikey = tex1Dfetch(texDataList, idx); + key.x = ikey.x + 0.5f; + key.y = ikey.y + 0.5f; + key.z = sigma; + if(subpixel || keepsign) + { +- float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x); ++ float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x); + if(subpixel) + { + key.x += offset.y; +@@ -835,7 +848,7 @@ void __global__ ComputeOrientation_Kernel(float4* d_list, + float dy = y - key.y; + float sq_dist = dx * dx + dy * dy; + if(sq_dist >= dist_threshold) continue; +- float2 got = tex2D(texDataF2, x, y); ++ float2 got = tex2D(texDataF2, x, y); + float weight = got.x * exp(sq_dist * factor); + float fidx = floorf(got.y * ten_degree_per_radius); + int oidx = fidx; +@@ -943,21 +956,31 @@ void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage + int len = list->GetImgWidth(); + if(len <= 0) return; + int width = got->GetImgWidth(), height = got->GetImgHeight(); ++ CuTexImage::CuTexObj texObjF4; ++ CuTexImage::CuTexObj texObjList; + if(existing_keypoint) + { +- list->BindTexture(texDataF4); ++ texObjF4 = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); + }else + { +- list->BindTexture(texDataList); +- if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4); ++ texObjList = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ if(GlobalUtil::_SubpixelLocalization) ++ { ++ texObjF4 = key->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ } + } +- got->BindTexture2D(texDataF2); ++ ++ CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc()); + + const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 
16 : ORIENTATION_COMPUTE_PER_BLOCK; + dim3 grid((len + block_width -1) / block_width); + dim3 block(block_width); + +- ComputeOrientation_Kernel<<>>((float4*) list->_cuData, ++ ComputeOrientation_Kernel<<>>( ++ gotTex.handle, ++ texObjF4.handle, ++ texObjList.handle, ++ (float4*) list->_cuData, + len, width, height, sigma, sigma_step, + GlobalUtil::_OrientationGaussianFactor, + GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor, +@@ -967,14 +990,14 @@ void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage + ProgramCU::CheckErrorCUDA("ComputeOrientation"); + } + +-template void __global__ ComputeDescriptor_Kernel(float4* d_des, int num, ++template void __global__ ComputeDescriptor_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num, + int width, int height, float window_factor) + { + const float rpi = 4.0/ 3.14159265358979323846; + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + int fidx = idx >> 4; + if(fidx >= num) return; +- float4 key = tex1Dfetch(texDataF4, fidx); ++ float4 key = tex1Dfetch(texDataF4, fidx); + int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; + float spt = fabs(key.z * window_factor); + float s, c; __sincosf(key.w, &s, &c); +@@ -1007,7 +1030,7 @@ template void __global__ ComputeDescriptor_Kernel(float4 + float nyn = fabs(ny); + if(nxn < 1.0f && nyn < 1.0f) + { +- float2 cc = tex2D(texDataF2, x, y); ++ float2 cc = tex2D(texDataF2, x, y); + float dnx = nx + offsetpt.x; + float dny = ny + offsetpt.y; + float ww = exp(-0.125f * (dnx * dnx + dny * dny)); +@@ -1048,14 +1071,14 @@ template void __global__ ComputeDescriptor_Kernel(float4 + } + + +-template void __global__ ComputeDescriptorRECT_Kernel(float4* d_des, int num, ++template void __global__ ComputeDescriptorRECT_Kernel(cudaTextureObject_t texDataF2, cudaTextureObject_t texDataF4, float4* d_des, int num, + int width, int height, float window_factor) + { + const float rpi = 4.0/ 3.14159265358979323846; + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + int fidx = idx >> 4; + if(fidx >= num) return; +- float4 key = tex1Dfetch(texDataF4, fidx); ++ float4 key = tex1Dfetch(texDataF4, fidx); + int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; + //float aspect_ratio = key.w / key.z; + //float aspect_sq = aspect_ratio * aspect_ratio; +@@ -1080,7 +1103,7 @@ template void __global__ ComputeDescriptorRECT_Kernel(fl + float nyn = fabs(ny); + if(nxn < 1.0f && nyn < 1.0f) + { +- float2 cc = tex2D(texDataF2, x, y); ++ float2 cc = tex2D(texDataF2, x, y); + float wx = 1.0 - nxn; + float wy = 1.0 - nyn; + float weight = wx * wy * cc.x; +@@ -1117,7 +1140,7 @@ template void __global__ ComputeDescriptorRECT_Kernel(fl + d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]); + } + +-void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num) ++void __global__ NormalizeDescriptor_Kernel(cudaTextureObject_t texDataF4, float4* d_des, int num) + { + float4 temp[32]; + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +@@ -1127,7 +1150,7 @@ void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num) + #pragma unroll + for(int i = 0; i < 32; ++i) + { +- temp[i] = tex1Dfetch(texDataF4, sidx +i); ++ temp[i] = tex1Dfetch(texDataF4, sidx +i); + norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y + + temp[i].z * temp[i].z + temp[i].w * temp[i].w); + } +@@ -1161,8 +1184,8 @@ void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* + int height = got->GetImgHeight(); + + 
dtex->InitTexture(num * 128, 1, 1); +- got->BindTexture2D(texDataF2); +- list->BindTexture(texDataF4); ++ CuTexImage::CuTexObj gotTex = got->BindTexture2D(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc()); + int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE; + dim3 grid((num * 16 + block_width -1) / block_width); + dim3 block(block_width); +@@ -1170,24 +1193,24 @@ void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* + if(rect) + { + if(GlobalUtil::_UseDynamicIndexing) +- ComputeDescriptorRECT_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptorRECT_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + else +- ComputeDescriptorRECT_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptorRECT_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + + }else + { + if(GlobalUtil::_UseDynamicIndexing) +- ComputeDescriptor_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptor_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + else +- ComputeDescriptor_Kernel<<>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); ++ ComputeDescriptor_Kernel<<>>(gotTex.handle, listTex.handle, (float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor); + } + if(GlobalUtil::_NormalizedSIFT) + { +- dtex->BindTexture(texDataF4); ++ CuTexImage::CuTexObj dtexTex = dtex->BindTexture(texDataDesc, cudaCreateChannelDesc()); + const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK; + dim3 grid((num + block_width -1) / block_width); + dim3 block(block_width); +- NormalizeDescriptor_Kernel<<>>((float4*) dtex->_cuData, num); ++ NormalizeDescriptor_Kernel<<>>(dtexTex.handle, (float4*) dtex->_cuData, num); + } + CheckErrorCUDA("ComputeDescriptor"); + } +@@ -1213,14 +1236,14 @@ int ProgramCU::CheckErrorCUDA(const char* location) + } + } + +-void __global__ ConvertDOG_Kernel(float* d_result, int width, int height) ++void __global__ ConvertDOG_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height) + { + int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y; + int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x; + if(col < width && row < height) + { + int index = row * width + col; +- float v = tex1Dfetch(texData, index); ++ float v = tex1Dfetch(texData, index); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? 
+ 0.5 : saturate(0.5+20.0*v); + } +@@ -1230,21 +1253,21 @@ void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out) + { + if(out->_cuData == NULL) return; + int width = dog->GetImgWidth(), height = dog ->GetImgHeight(); +- dog->BindTexture(texData); ++ CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); + dim3 block(BLOCK_DIM, BLOCK_DIM); +- ConvertDOG_Kernel<<>>((float*) out->_cuData, width, height); ++ ConvertDOG_Kernel<<>>(dogTex.handle, (float*) out->_cuData, width, height); + ProgramCU::CheckErrorCUDA("DisplayConvertDOG"); + } + +-void __global__ ConvertGRD_Kernel(float* d_result, int width, int height) ++void __global__ ConvertGRD_Kernel(cudaTextureObject_t texData, float* d_result, int width, int height) + { + int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y; + int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x; + if(col < width && row < height) + { + int index = row * width + col; +- float v = tex1Dfetch(texData, index << 1); ++ float v = tex1Dfetch(texData, index << 1); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? + 0 : saturate(5 * v); + +@@ -1256,14 +1279,14 @@ void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out) + { + if(out->_cuData == NULL) return; + int width = got->GetImgWidth(), height = got ->GetImgHeight(); +- got->BindTexture(texData); ++ CuTexImage::CuTexObj gotTex = got->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); + dim3 block(BLOCK_DIM, BLOCK_DIM); +- ConvertGRD_Kernel<<>>((float*) out->_cuData, width, height); ++ ConvertGRD_Kernel<<>>(gotTex.handle, (float*) out->_cuData, width, height); + ProgramCU::CheckErrorCUDA("DisplayConvertGRD"); + } + +-void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height) ++void __global__ ConvertKEY_Kernel(cudaTextureObject_t texData, cudaTextureObject_t texDataF4, float4* d_result, int width, int height) + { + + int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y; +@@ -1271,10 +1294,10 @@ void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height) + if(col < width && row < height) + { + int index = row * width + col; +- float4 keyv = tex1Dfetch(texDataF4, index); ++ float4 keyv = tex1Dfetch(texDataF4, index); + int is_key = (keyv.x == 1.0f || keyv.x == -1.0f); + int inside = col > 0 && row > 0 && row < height -1 && col < width - 1; +- float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5; ++ float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5; + d_result[index] = is_key && inside ? + (keyv.x > 0? 
make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)): + make_float4(v, v, v, 1.0f) ; +@@ -1284,19 +1307,19 @@ void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* + { + if(out->_cuData == NULL) return; + int width = key->GetImgWidth(), height = key ->GetImgHeight(); +- dog->BindTexture(texData); +- key->BindTexture(texDataF4); ++ CuTexImage::CuTexObj dogTex = dog->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc()); + dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); + dim3 block(BLOCK_DIM, BLOCK_DIM); +- ConvertKEY_Kernel<<>>((float4*) out->_cuData, width, height); ++ ConvertKEY_Kernel<<>>(dogTex.handle, keyTex.handle, (float4*) out->_cuData, width, height); + } + + +-void __global__ DisplayKeyPoint_Kernel(float4 * d_result, int num) ++void __global__ DisplayKeyPoint_Kernel(cudaTextureObject_t texDataF4, float4 * d_result, int num) + { + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + if(idx >= num) return; +- float4 v = tex1Dfetch(texDataF4, idx); ++ float4 v = tex1Dfetch(texDataF4, idx); + d_result[idx] = make_float4(v.x, v.y, 0, 1.0f); + } + +@@ -1306,17 +1329,17 @@ void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out) + int block_width = 64; + dim3 grid((num + block_width -1) /block_width); + dim3 block(block_width); +- ftex->BindTexture(texDataF4); +- DisplayKeyPoint_Kernel<<>>((float4*) out->_cuData, num); ++ CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ DisplayKeyPoint_Kernel<<>>(ftexTex.handle, (float4*) out->_cuData, num); + ProgramCU::CheckErrorCUDA("DisplayKeyPoint"); + } + +-void __global__ DisplayKeyBox_Kernel(float4* d_result, int num) ++void __global__ DisplayKeyBox_Kernel(cudaTextureObject_t texDataF4, float4* d_result, int num) + { + int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; + if(idx >= num) return; + int kidx = idx / 10, vidx = idx - IMUL(kidx , 10); +- float4 v = tex1Dfetch(texDataF4, kidx); ++ float4 v = tex1Dfetch(texDataF4, kidx); + float sz = fabs(v.z * 3.0f); + /////////////////////// + float s, c; __sincosf(v.w, &s, &c); +@@ -1336,24 +1359,8 @@ void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out) + int block_width = 32; + dim3 grid((len * 10 + block_width -1) / block_width); + dim3 block(block_width); +- ftex->BindTexture(texDataF4); +- DisplayKeyBox_Kernel<<>>((float4*) out->_cuData, len * 10); +-} +-/////////////////////////////////////////////////////////////////// +-inline void CuTexImage:: BindTexture(textureReference& texRef) +-{ +- cudaBindTexture(NULL, &texRef, _cuData, &texRef.channelDesc, _numBytes); +-} +- +-inline void CuTexImage::BindTexture2D(textureReference& texRef) +-{ +-#if defined(SIFTGPU_ENABLE_LINEAR_TEX2D) +- cudaBindTexture2D(0, &texRef, _cuData, &texRef.channelDesc, _imgWidth, _imgHeight, _imgWidth* _numChannel* sizeof(float)); +-#else +- cudaChannelFormatDesc desc; +- cudaGetChannelDesc(&desc, _cuData2D); +- cudaBindTextureToArray(&texRef, _cuData2D, &desc); +-#endif ++ CuTexImage::CuTexObj ftexTex = ftex->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ DisplayKeyBox_Kernel<<>>(ftexTex.handle, (float4*) out->_cuData, len * 10); + } + + int ProgramCU::CheckCudaDevice(int device) +@@ -1401,11 +1408,7 @@ int ProgramCU::CheckCudaDevice(int device) + #define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX) + #define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY) + +- +-texture texDes1; +-texture texDes2; +- 
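Editorial aside (not part of 1840.patch): the hunks above and below replace SiftGPU's global texture references (texData, texDataF4, texDes1, texDes2, ...) with cudaTextureObject_t handles that CuTexImage::BindTexture creates per call and the CuTexObj destructor releases, since texture references were removed in CUDA 12. A minimal sketch of that pattern for a linear float buffer; the function and parameter names (CreateLinearFloatTexture, d_data, numBytes) are illustrative only and do not appear in the patch:

    #include <cstring>
    #include <cuda_runtime.h>

    // Wrap an existing device buffer in a texture object (no copy is made).
    cudaTextureObject_t CreateLinearFloatTexture(float* d_data, size_t numBytes) {
      cudaResourceDesc resDesc;
      std::memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = d_data;
      resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
      resDesc.res.linear.sizeInBytes = numBytes;

      cudaTextureDesc texDesc;
      std::memset(&texDesc, 0, sizeof(texDesc));
      texDesc.readMode = cudaReadModeElementType;
      texDesc.addressMode[0] = cudaAddressModeClamp;
      texDesc.filterMode = cudaFilterModePoint;
      texDesc.normalizedCoords = 0;

      cudaTextureObject_t tex = 0;
      cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
      return tex;  // kernels read it with tex1Dfetch<float>(tex, i);
                   // release with cudaDestroyTextureObject(tex).
    }

In the patched code the kernels receive such a handle as an explicit cudaTextureObject_t argument instead of referring to a file-scope texture, which is why every kernel signature in these hunks gains one or more texture-object parameters.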
+-void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp) ++void __global__ MultiplyDescriptor_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2, int* d_result, int num1, int num2, int3* d_temp) + { + int idx01 = (blockIdx.y * MULT_BLOCK_DIMY), idx02 = (blockIdx.x * MULT_BLOCK_DIMX); + +@@ -1419,13 +1422,13 @@ void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int + //Load feature descriptors + /////////////////////////////////////////////////////////////// + #if MULT_BLOCK_DIMY == 16 +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; + #elif MULT_BLOCK_DIMY == 8 + if(threadIdx.x < 64) + { +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; + } +@@ -1446,7 +1449,7 @@ void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int + #pragma unroll + for(int i = 0; i < 8; ++i) + { +- uint4 v = tex1Dfetch(texDes2, read_idx2 + i); ++ uint4 v = tex1Dfetch(texDes2, read_idx2 + i); + unsigned char* p2 = (unsigned char*)(&v); + #pragma unroll + for(int k = 0; k < MULT_BLOCK_DIMY; ++k) +@@ -1501,20 +1504,23 @@ void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImag + dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY); + texDot->InitTexture( num2,num1); + if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32); +- des1->BindTexture(texDes1); +- des2->BindTexture(texDes2); ++ CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc()); + +- MultiplyDescriptor_Kernel<<>>((int*)texDot->_cuData, num1, num2, ++ MultiplyDescriptor_Kernel<<>>(des1Tex.handle, des2Tex.handle, (int*)texDot->_cuData, num1, num2, + (texCRT? 
(int3*)texCRT->_cuData : NULL)); + } + +-texture texLoc1; +-texture texLoc2; +-struct Matrix33{float mat[3][3];}; ++struct Matrix33 ++{ ++ float mat[3][3]; ++}; + + + +-void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp, ++void __global__ MultiplyDescriptorG_Kernel(cudaTextureObject_t texDes1, cudaTextureObject_t texDes2, ++ cudaTextureObject_t texLoc1, cudaTextureObject_t texLoc2, ++ int* d_result, int num1, int num2, int3* d_temp, + Matrix33 H, float hdistmax, Matrix33 F, float fdistmax) + { + int idx01 = (blockIdx.y * MULT_BLOCK_DIMY); +@@ -1529,7 +1535,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2; + int cache_idx1 = IMUL(row4, 17) + (col4 << 2); + #if MULT_BLOCK_DIMY == 16 +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; + data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; +@@ -1537,7 +1543,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + #elif MULT_BLOCK_DIMY == 8 + if(threadIdx.x < 64) + { +- uint4 v = tex1Dfetch(texDes1, read_idx1); ++ uint4 v = tex1Dfetch(texDes1, read_idx1); + data1[cache_idx1] = v.x; + data1[cache_idx1+1] = v.y; + data1[cache_idx1+2] = v.z; +@@ -1549,7 +1555,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + __syncthreads(); + if(threadIdx.x < MULT_BLOCK_DIMY * 2) + { +- loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x); ++ loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x); + } + __syncthreads(); + if(idx2 >= num2) return; +@@ -1558,7 +1564,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + //geometric verification + ///////////////////////////////////////////////////////////////////////////////////////////// + int good_count = 0; +- float2 loc2 = tex1Dfetch(texLoc2, idx2); ++ float2 loc2 = tex1Dfetch(texLoc2, idx2); + #pragma unroll + for(int i = 0; i < MULT_BLOCK_DIMY; ++i) + { +@@ -1608,7 +1614,7 @@ void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, in + #pragma unroll + for(int i = 0; i < 8; ++i) + { +- uint4 v = tex1Dfetch(texDes2, read_idx2 + i); ++ uint4 v = tex1Dfetch(texDes2, read_idx2 + i); + unsigned char* p2 = (unsigned char*)(&v); + #pragma unroll + for(int k = 0; k < MULT_BLOCK_DIMY; ++k) +@@ -1674,11 +1680,12 @@ void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2, + //intermediate results + texDot->InitTexture( num2,num1); + if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3); +- loc1->BindTexture(texLoc1); +- loc2->BindTexture(texLoc2); +- des1->BindTexture(texDes1); +- des2->BindTexture(texDes2); +- MultiplyDescriptorG_Kernel<<>>((int*)texDot->_cuData, num1, num2, ++ CuTexImage::CuTexObj loc1Tex = loc1->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj loc2Tex = loc2->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj des1Tex = des1->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ CuTexImage::CuTexObj des2Tex = des2->BindTexture(texDataDesc, cudaCreateChannelDesc()); ++ MultiplyDescriptorG_Kernel<<>>(des1Tex.handle, des2Tex.handle, loc1Tex.handle, loc2Tex.handle, ++ (int*)texDot->_cuData, num1, num2, + (texCRT? 
(int3*)texCRT->_cuData : NULL), + MatH, hdistmax, MatF, fdistmax); + } +diff --git a/lib/SiftGPU/PyramidCU.cpp b/lib/SiftGPU/PyramidCU.cpp +index ea6711931..074b442da 100644 +--- a/lib/SiftGPU/PyramidCU.cpp ++++ b/lib/SiftGPU/PyramidCU.cpp +@@ -237,7 +237,6 @@ void PyramidCU::ResizePyramid(int w, int h) + if( j >= 1 && j < 1 + param._dog_level_num) + { + got->InitTexture(wa, h, 2); //2 * nlev - 6 +- got->InitTexture2D(); + } + if(j > 1 && j < nlev -1) key->InitTexture(wa, h, 4); // nlev -3 ; 4 * nlev - 12 + } +@@ -296,7 +295,6 @@ void PyramidCU::FitPyramid(int w, int h) + if( j >= 1 && j < 1 + param._dog_level_num) + { + got->InitTexture(wa, h, 2); //2 * nlev - 6 +- got->InitTexture2D(); + } + if(j > 1 && j < nlev -1) key->InitTexture(wa, h, 4); // nlev -3 ; 4 * nlev - 12 + } +@@ -1084,7 +1082,7 @@ void PyramidCU::CopyGradientTex() + //compute the gradient + for(int j = 0; j < param._dog_level_num ; j++, got++, idx++) + { +- if(_levelFeatureNum[idx] > 0) got->CopyToTexture2D(); ++ // if(_levelFeatureNum[idx] > 0) got->CopyToTexture2D(); + } + } + if(GlobalUtil::_timingS) diff --git a/recipe/1840.patch b/recipe/1840.patch new file mode 100644 index 0000000..6629898 --- /dev/null +++ b/recipe/1840.patch @@ -0,0 +1,14723 @@ +From c8b6656fe83810e5edc059cb0b0c75528b905dab Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Sun, 12 Mar 2023 15:43:14 +0100 +Subject: [PATCH 1/2] Remove PBA as bundle adjustment backend to support CUDA + 12+ + +--- + CMakeLists.txt | 5 +- + cmake/CMakeConfig.cmake.in | 1 - + doc/bibliography.rst | 4 - + doc/tutorial.rst | 6 +- + lib/CMakeLists.txt | 1 - + lib/PBA/CMakeLists.txt | 41 - + lib/PBA/ConfigBA.cpp | 589 --- + lib/PBA/ConfigBA.h | 226 -- + lib/PBA/CuTexImage.cpp | 136 - + lib/PBA/CuTexImage.h | 83 - + lib/PBA/DataInterface.h | 423 --- + lib/PBA/LICENSE | 674 ---- + lib/PBA/ProgramCU.cu | 3637 ------------------- + lib/PBA/ProgramCU.h | 127 - + lib/PBA/SparseBundleCPU.cpp | 4369 ----------------------- + lib/PBA/SparseBundleCPU.h | 286 -- + lib/PBA/SparseBundleCU.cpp | 1989 ----------- + lib/PBA/SparseBundleCU.h | 176 - + lib/PBA/pba.cpp | 134 - + lib/PBA/pba.h | 156 - + lib/PBA/util.h | 753 ---- + src/controllers/incremental_mapper.cc | 22 +- + src/controllers/incremental_mapper.h | 7 - + src/optim/bundle_adjustment.cc | 253 -- + src/optim/bundle_adjustment.h | 66 - + src/optim/bundle_adjustment_test.cc | 108 - + src/sfm/incremental_mapper.cc | 33 - + src/sfm/incremental_mapper.h | 5 +- + src/ui/license_widget.cc | 19 - + src/ui/license_widget.h | 1 - + src/ui/reconstruction_options_widget.cc | 3 - + src/util/option_manager.cc | 4 - + 32 files changed, 4 insertions(+), 14333 deletions(-) + delete mode 100644 lib/PBA/CMakeLists.txt + delete mode 100644 lib/PBA/ConfigBA.cpp + delete mode 100644 lib/PBA/ConfigBA.h + delete mode 100644 lib/PBA/CuTexImage.cpp + delete mode 100644 lib/PBA/CuTexImage.h + delete mode 100644 lib/PBA/DataInterface.h + delete mode 100755 lib/PBA/LICENSE + delete mode 100644 lib/PBA/ProgramCU.cu + delete mode 100644 lib/PBA/ProgramCU.h + delete mode 100644 lib/PBA/SparseBundleCPU.cpp + delete mode 100644 lib/PBA/SparseBundleCPU.h + delete mode 100644 lib/PBA/SparseBundleCU.cpp + delete mode 100644 lib/PBA/SparseBundleCU.h + delete mode 100644 lib/PBA/pba.cpp + delete mode 100644 lib/PBA/pba.h + delete mode 100644 lib/PBA/util.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 2a9724e15..e4d6e436b 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -264,7 +264,7 @@ if(CUDA_ENABLED AND CUDA_FOUND) + # 
Do not show warnings if the architectures are deprecated. + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") + # Do not show warnings if cuda library functions are deprecated. +- set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-declarations") ++ # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-declarations") + # Explicitly set PIC flags for CUDA targets. + if(NOT IS_MSVC) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fPIC") +@@ -404,7 +404,6 @@ endif() + + set(COLMAP_INTERNAL_LIBRARIES + lsd +- pba + poisson_recon + sift_gpu + # vlfeat +@@ -427,7 +426,6 @@ add_subdirectory(src) + ################################################################################ + + COLMAP_ADD_SOURCE_DIR(lib/LSD LIB_LSD_SRCS *.h *.c) +-COLMAP_ADD_SOURCE_DIR(lib/PBA LIB_PBA_SRCS *.h *.cpp *.cu) + COLMAP_ADD_SOURCE_DIR(lib/PoissonRecon LIB_POISSON_RECON_SRCS *.h *.cpp *.inl) + COLMAP_ADD_SOURCE_DIR(lib/SiftGPU LIB_SIFT_GPU_SRCS *.h *.cpp *.cu) + # COLMAP_ADD_SOURCE_DIR(lib/VLFeat LIB_VLFEAT_SRCS *.h *.c *.tc) +@@ -451,7 +449,6 @@ COLMAP_ADD_SOURCE_DIR(src/util UTIL_SRCS *.h *.cc) + add_library( + ${COLMAP_SRC_ROOT_FOLDER} + ${LIB_LSD_SRCS} +- ${LIB_PBA_SRCS} + ${LIB_POISSON_RECON_SRCS} + ${LIB_SIFT_GPU_SRCS} + # ${LIB_VLFEAT_SRCS} +diff --git a/cmake/CMakeConfig.cmake.in b/cmake/CMakeConfig.cmake.in +index d6133f027..755fcab32 100644 +--- a/cmake/CMakeConfig.cmake.in ++++ b/cmake/CMakeConfig.cmake.in +@@ -167,7 +167,6 @@ set(COLMAP_LINK_DIRS + + set(COLMAP_INTERNAL_LIBRARIES + lsd +- pba + poisson_recon + sqlite3 + sift_gpu +diff --git a/doc/bibliography.rst b/doc/bibliography.rst +index 4845bc83d..1922adb48 100755 +--- a/doc/bibliography.rst ++++ b/doc/bibliography.rst +@@ -40,9 +40,5 @@ Bibliography + .. [lowe04] Lowe, David G. "Distinctive image features from scale-invariant + keypoints". International journal of computer vision 60.2 (2004): 91-110. + +-.. [wu11] Wu, Changchang, Sameer Agarwal, Brian Curless, +- and Steven M. Seitz. "Multicore bundle adjustment." +- Conference on Computer Vision and Pattern Recognition, 2011. +- + .. [wu13] Wu, Changchang. "Towards linear-time incremental structure from + motion." International Conference 3D Vision, 2013. +diff --git a/doc/tutorial.rst b/doc/tutorial.rst +index cdf6701e1..fcd071523 100755 +--- a/doc/tutorial.rst ++++ b/doc/tutorial.rst +@@ -384,11 +384,7 @@ available controls. COLMAP attempts to reconstruct multiple models if not all + images are registered into the same model. The different models can be selected + from the drop-down menu in the toolbar. If the different models have common + registered images, you can use the ``model_converter`` executable to merge them +-into a single reconstruction (see :ref:`FAQ ` for details). If +-all your images use the `SIMPLE_RADIAL` camera model (default) without shared +-intrinsics, you can use PBA [wu11]_ instead of Ceres Solver [ceres]_ for fast +-bundle adjustment, which can be activated in the reconstruction options under +-the bundle adjustment section (`use_pba=true`). ++into a single reconstruction (see :ref:`FAQ ` for details). + + Ideally, the reconstruction works fine and all images are registered. 
If this is + not the case, it is recommended to: +diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt +index a6c26e7..a61e438 100644 +--- a/lib/CMakeLists.txt ++++ b/lib/CMakeLists.txt +@@ -36,7 +36,6 @@ elseif(IS_GNU OR IS_CLANG) + endif() + + add_subdirectory(LSD) +-add_subdirectory(PBA) + add_subdirectory(PoissonRecon) + add_subdirectory(SiftGPU) + # add_subdirectory(VLFeat) +diff --git a/lib/PBA/CMakeLists.txt b/lib/PBA/CMakeLists.txt +deleted file mode 100644 +index 2473436e6..000000000 +--- a/lib/PBA/CMakeLists.txt ++++ /dev/null +@@ -1,41 +0,0 @@ +-if(NOT IS_MSVC) +- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") +-endif() +- +-if(NOT SIMD_ENABLED) +- add_definitions("-DDISABLE_CPU_NEON") +- add_definitions("-DDISABLE_CPU_AVX") +- add_definitions("-DDISABLE_CPU_SSE") +-endif() +- +-if(CUDA_ENABLED) +- COLMAP_ADD_CUDA_LIBRARY(pba +- ConfigBA.cpp +- ConfigBA.h +- CuTexImage.cpp +- CuTexImage.h +- DataInterface.h +- pba.cpp +- pba.h +- ProgramCU.cu +- ProgramCU.h +- SparseBundleCPU.cpp +- SparseBundleCPU.h +- SparseBundleCU.cpp +- SparseBundleCU.h +- util.h +- ) +-else() +- add_definitions("-DPBA_NO_GPU") +- +- COLMAP_ADD_LIBRARY(pba +- ConfigBA.cpp +- ConfigBA.h +- DataInterface.h +- pba.cpp +- pba.h +- SparseBundleCPU.cpp +- SparseBundleCPU.h +- util.h +- ) +-endif() +diff --git a/lib/PBA/ConfigBA.cpp b/lib/PBA/ConfigBA.cpp +deleted file mode 100644 +index f59209477..000000000 +--- a/lib/PBA/ConfigBA.cpp ++++ /dev/null +@@ -1,589 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ConfigBA.cpp +-// Author: Changchang Wu +-// Description : implementation of the configuration object class +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-using std::cout; +-using std::ofstream; +-using std::string; +- +-#ifndef _WIN32 +-#include +-#endif +- +-#include "ConfigBA.h" +- +-#ifdef _MSC_VER +-#define strcpy strcpy_s +-#define sprintf sprintf_s +-#endif +- +-namespace pba { +- +-ConfigBA::ConfigBA() { +- __lm_max_iteration = 50; +- __lm_initial_damp = 1e-3f; +- __lm_minimum_damp = 1e-10f; +- __lm_maximum_damp = 1e+5f; +- __lm_delta_threshold = 1e-6f; +- __lm_gradient_threshold = 1e-10f; +- __lm_mse_threshold = 0.25f; +- __lm_use_diagonal_damp = true; +- __lm_check_gradient = false; +- __lm_damping_auto_switch = 0; +- __bundle_time_budget = 0; +- __bundle_mode_next = 0; +- __bundle_current_mode = 0; +- +- //////////////////////////// +- __cg_max_iteration = 100; +- __cg_min_iteration = 10; +- __cg_recalculate_freq = 0; +- __cg_norm_threshold = 0.1f; +- __cg_norm_guard = 1.0f; +- __pba_experimental = 0; +- __cg_schur_complement = 0; +- +- //////////////////////////// +- __fixed_intrinsics = false; +- __use_radial_distortion = 0; +- __reset_initial_distortion = false; +- +- ////////////////////////////// +- __verbose_level = 2; +- __verbose_cg_iteration = false; +- __verbose_function_time = false; +- __verbose_allocation = false; +- __verbose_sse = false; +- __save_gradient_norm = false; +- __stat_filename = NULL; +- __matlab_format_stat = true; +- +- ///////////////////////////// +- __jc_store_transpose = true; +- __jc_store_original = true; +- __no_jacobian_store = false; +- +- __focal_normalize = true; +- __depth_normalize = true; +- __depth_degeneracy_fix = true; +- __jacobian_normalize = true; +- __data_normalize_median = 0.5f; +- __depth_check_epsilon = 0.01f; +- +- //////////////////////////// +- __multiply_jx_usenoj = true; +- +- //////////////////////////// +- __accurate_gain_ratio = true; +- //////////////////////////// +- __cpu_data_precision = 0; +- __current_device = -1; +- __selected_device = -1; +- __memory_usage = 0; +- __current_iteration = 0; +- __num_cpu_thread_all = 0; +- +- /////////////////////// +- __debug_pba = false; +- __profile_pba = 0; +- __cpu_thread_profile = false; +- __warmup_device = false; +- +- /////////////////////// +- __driver_output = NULL; +- +- ////////////////////////// +- ResetBundleStatistics(); +-} +- +-void ConfigBA::ResetBundleStatistics() { +- __abort_flag = false; +- __num_lm_success = 0; +- __num_lm_iteration = 0; +- __num_cg_iteration = 0; +- __num_projection_eval = 0; +- __num_jacobian_eval = 0; +- __num_camera_modified = 0; +- __num_point_behind = 0; +- __initial_mse = 0; +- __final_mse = 0; +- __final_mse_x = 0; +- __focal_scaling = 1.0f; +- __depth_scaling = 1.0f; +- __pba_return_code = 0; +- __current_iteration = 0; +- __warmup_device = false; +- __bundle_current_mode = __bundle_mode_next; +- for (int i = 0; i < NUM_TIMER; ++i) __timer_record[i] = 0; +- __bundle_records.resize(0); +- if (__num_cpu_thread_all) { +- std::cout << "WARNING: set all thread number to " << __num_cpu_thread_all +- << '\n'; +- for (int i = 0; i < NUM_FUNC; ++i) +- __num_cpu_thread[i] = __num_cpu_thread_all; +- } +-} +- +-void ConfigBA::ResetTemporarySetting() { +- __reset_initial_distortion = false; +- __bundle_time_budget = 0; +- __bundle_mode_next = 0; +- __bundle_current_mode = 0; +- __stat_filename = NULL; +- if (__lm_damping_auto_switch > 0 && !__lm_use_diagonal_damp) +- __lm_use_diagonal_damp = true; +-} +- +-void 
ConfigBA::SaveBundleStatistics(int ncam, int npt, int nproj) { +- if (__profile_pba) return; +- if (__stat_filename && __bundle_records.size() > 0) { +- char filenamebuf[1024]; +- char* ret = strchr(__stat_filename, '\r'); +- if (ret) ret[0] = 0; +- char* dot = strrchr(__stat_filename, '.'); +- if (dot && strchr(dot, '/') == NULL && strchr(dot, '\\') == NULL) +- strcpy(filenamebuf, __stat_filename); // if filename has extension, use +- // it +- else +- sprintf(filenamebuf, "%s%s%s%s%s%s%s%s%s.%s", __stat_filename, +- __cpu_data_precision == 0 ? "_gpu" : "_cpu", +- __cpu_data_precision == sizeof(double) ? "d" : "", +- __cg_schur_complement ? "_schur" : "\0", +- __lm_use_diagonal_damp +- ? "\0" +- : (__lm_damping_auto_switch > 0 ? "_ad" : "_id"), +- __use_radial_distortion == -1 +- ? "_md" +- : (__use_radial_distortion ? "_pd" : "\0"), +- __jacobian_normalize ? "\0" : "_nojn", +- __focal_normalize || __depth_normalize ? "\0" : "_nodn", +- __depth_degeneracy_fix ? "\0" : "_nodf", +- __matlab_format_stat ? "m" : "log"); +- +- /////////////////////////////////////////////////////// +- ofstream out(filenamebuf); +- out << std::left; +- +- float overhead = +- (BundleTimerGet(TIMER_OVERALL) - BundleTimerGet(TIMER_OPTIMIZATION)); +- if (__matlab_format_stat) +- out << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n" +- << "ncam = " << ncam << "; npt = " << npt << "; nproj = " << nproj +- << ";\n" +- << "%% overhead = " << overhead << ";\n" +- << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n" +- << "%% " << std::setw(10) << __num_lm_iteration +- << "\t linear systems solved;\n" +- << "%% " << std::setw(10) << __num_cg_iteration +- << "\t conjugated gradient steps;\n" +- << "%% " << std::setw(10) << BundleTimerGet(TIMER_OVERALL) +- << "\t seconds used overall;\n" +- << "%% " << std::setw(10) << BundleTimerGet(TIMER_PREPROCESSING) +- << "\t seconds on pre-processing;\n" +- << "%% " << std::setw(10) +- << BundleTimerGet(TIMER_GPU_UPLOAD) + +- BundleTimerGet(TIMER_GPU_ALLOCATION) +- << "\t seconds on upload;\n" +- << "%% " << std::setw(10) << BundleTimerGet(TIMER_OPTIMIZATION) +- << "\t seconds on optimization;\n" +- << "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n" +- << (__cpu_data_precision == 0 ? "gpustat" : "cpustat") +- << (__cpu_data_precision == sizeof(double) ? "_db" : "") +- << (__jacobian_normalize ? "" : "_nojn") +- << (__depth_degeneracy_fix ? "" : "_nodf") +- << (__cg_schur_complement ? "_schur" : "") << " = [\n"; +- +- for (size_t i = 0; i < __bundle_records.size(); ++i) +- out << std::setw((i % 7 > 2) ? ((i % 7 > 4 && !__save_gradient_norm && +- !__lm_check_gradient) +- ? 0 +- : 12) +- : 5) +- << (__bundle_records[i] + (i == 1 ? overhead : 0)) +- << (i % 7 == 6 ? 
'\n' : '\t'); +- +- if (__matlab_format_stat) out << "];\n\n"; +- +- if (__verbose_level) +- std::cout << "\n---------------------------------------\n" << filenamebuf; +- } +-} +- +-#define REPORT_FUNCTION_TIME(FID) \ +- std::setw(5) << (((int)(BundleTimerGet(FID) * 100 + 50)) * 0.01) << "(" \ +- << std::setw(2) \ +- << 0.1f * ((int)(1000 * BundleTimerGet(FID) / \ +- BundleTimerGet(TIMER_OPTIMIZATION))) \ +- << "%)" +- +-void ConfigBA::PrintBundleStatistics() { +- if (__profile_pba) return; +- +- if (__verbose_level) +- std::cout << "\n---------------------------------------\n" << std::setw(10) +- << __num_lm_success << "\t successful iterations;\n" +- << std::setw(10) << __num_lm_iteration +- << "\t linear systems solved;\n" << std::setw(10) +- << __num_cg_iteration << "\t conjugated gradient steps;\n" +- << std::setw(10) << BundleTimerGet(TIMER_OVERALL) +- << "\t seconds used overall;\n" << std::setw(10) +- << BundleTimerGet(TIMER_GPU_ALLOCATION) +- << "\t seconds on allocation;\n" << std::setw(10) +- << BundleTimerGet(TIMER_PREPROCESSING) +- << "\t seconds on pre-processing;\n" << std::setw(10) +- << BundleTimerGet(TIMER_GPU_UPLOAD) << "\t seconds on upload;\n" +- << std::setw(10) << BundleTimerGet(TIMER_OPTIMIZATION) +- << "\t seconds on optimization;\n"; +- if (__verbose_level && __cpu_data_precision) +- std::cout << REPORT_FUNCTION_TIME(TIMER_FUNCTION_JJ) +- << "\t seconds on jacobians;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_PJ) +- << "\t seconds on projections;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_JX) +- << "\t seconds on JX;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_JTE) +- << "\t seconds on JtE;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_BC) +- << "\t seconds to compute preconditioner;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_MP) +- << "\t seconds to apply preconditioner;\n" +- << REPORT_FUNCTION_TIME(TIMER_FUNCTION_UP) +- << "\t seconds to update parameters;\n"; +- if (__verbose_level) +- std::cout << "---------------------------------------\n" +- << "mse = " << __initial_mse << " -> " << __final_mse << "" +- << " (" << __final_mse_x +- << (__use_radial_distortion == -1 ? 
'D' : 'U') << ")\n" +- << "---------------------------------------\n"; +-} +- +-double ConfigBA::MyClock() { +-#ifdef _WIN32 +- return clock() / double(CLOCKS_PER_SEC); +-#else +- static int started = 0; +- static struct timeval tstart; +- if (started == 0) { +- gettimeofday(&tstart, NULL); +- started = 1; +- return 0; +- } else { +- struct timeval now; +- gettimeofday(&now, NULL); +- return ((now.tv_usec - tstart.tv_usec) / 1000000.0 + +- (now.tv_sec - tstart.tv_sec)); +- } +-#endif +-} +- +-void ConfigBA::BundleTimerStart(int timer) { +- __timer_record[timer] = MyClock(); +-} +- +-void ConfigBA::BundleTimerSwitch(int timer) { +- __timer_record[timer] = MyClock() - __timer_record[timer]; +-} +- +-void ConfigBA::BundleTimerSwap(int timer1, int timer2) { +- BundleTimerSwitch(timer1); +- BundleTimerSwitch(timer2); +-} +- +-float ConfigBA::BundleTimerGet(int timer) { +- return float(__timer_record[timer]); +-} +- +-float ConfigBA::BundleTimerGetNow(int timer) { +- return 0.01f * ((int)(100 * (MyClock() - __timer_record[timer]))); +-} +- +-bool ConfigBA::IsTimeBudgetAvailable() { +- if (__bundle_time_budget <= 0) return true; +- return BundleTimerGetNow(TIMER_OVERALL) < __bundle_time_budget; +-} +- +-void ConfigBA::SaveBundleRecord(int iter, float res, float damping, float gn, +- float gi) { +- __bundle_records.push_back(float(iter)); +- __bundle_records.push_back(BundleTimerGetNow()); +- __bundle_records.push_back(float(__num_cg_iteration)); +- __bundle_records.push_back(res); +- __bundle_records.push_back(damping); +- __bundle_records.push_back(gn); +- __bundle_records.push_back(gi); +-} +- +-void ConfigBA::ParseParam(int argc, char** argv) { +-#define CHAR1_TO_INT(x) ((x >= 'A' && x <= 'Z') ? x + 32 : x) +-#define CHAR2_TO_INT(str, i) \ +- (str[i] ? CHAR1_TO_INT(str[i]) + (CHAR1_TO_INT(str[i + 1]) << 8) : 0) +-#define CHAR3_TO_INT(str, i) \ +- (str[i] ? 
CHAR1_TO_INT(str[i]) + (CHAR2_TO_INT(str, i + 1) << 8) : 0) +-#define STRING_TO_INT(str) (CHAR1_TO_INT(str[0]) + (CHAR3_TO_INT(str, 1) << 8)) +- +-#ifdef _MSC_VER +-// charizing is microsoft only +-#define MAKEINT1(a) (#@ a) +-#define sscanf sscanf_s +-#else +-#define mychar0 '0' +-#define mychar1 '1' +-#define mychar2 '2' +-#define mychar3 '3' +-#define mychara 'a' +-#define mycharb 'b' +-#define mycharc 'c' +-#define mychard 'd' +-#define mychare 'e' +-#define mycharf 'f' +-#define mycharg 'g' +-#define mycharh 'h' +-#define mychari 'i' +-#define mycharj 'j' +-#define mychark 'k' +-#define mycharl 'l' +-#define mycharm 'm' +-#define mycharn 'n' +-#define mycharo 'o' +-#define mycharp 'p' +-#define mycharq 'q' +-#define mycharr 'r' +-#define mychars 's' +-#define mychart 't' +-#define mycharu 'u' +-#define mycharv 'v' +-#define mycharw 'w' +-#define mycharx 'x' +-#define mychary 'y' +-#define mycharz 'z' +-#define MAKEINT1(a) (mychar##a) +-#endif +-#define MAKEINT2(a, b) (MAKEINT1(a) + (MAKEINT1(b) << 8)) +-#define MAKEINT3(a, b, c) (MAKEINT1(a) + (MAKEINT2(b, c) << 8)) +-#define MAKEINT4(a, b, c, d) (MAKEINT1(a) + (MAKEINT3(b, c, d) << 8)) +- +- char *arg, *param, *opt; +- int opti, argi; +- float argf; +- for (int i = 0; i < argc; i++) { +- arg = argv[i]; +- if (arg == NULL || arg[0] != '-' || !arg[1]) continue; +- opt = arg + 1; +- opti = STRING_TO_INT(opt); +- param = argv[i + 1]; +- +- //////////////////////////////// +- switch (opti) { +- case MAKEINT3(l, m, i): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __lm_max_iteration = argi; +- break; +- case MAKEINT3(l, m, d): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf >= 0) +- __lm_delta_threshold = argf; +- break; +- case MAKEINT3(l, m, e): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf >= 0) +- __lm_mse_threshold = argf; +- break; +- case MAKEINT3(l, m, g): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_gradient_threshold = argf; +- break; +- case MAKEINT4(d, a, m, p): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_initial_damp = argf; +- break; +- case MAKEINT4(d, m, i, n): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_minimum_damp = argf; +- break; +- case MAKEINT4(d, m, a, x): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_maximum_damp = argf; +- break; +- case MAKEINT3(c, g, i): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __cg_max_iteration = argi; +- break; +- case MAKEINT4(c, g, i, m): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __cg_min_iteration = argi; +- break; +- case MAKEINT3(c, g, n): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __cg_norm_threshold = argf; +- break; +- case MAKEINT3(c, g, g): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __cg_norm_guard = argf; +- break; +- case MAKEINT4(c, g, r, f): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __cg_recalculate_freq = argi; +- break; +- case MAKEINT1(v): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __verbose_level = argi; +- break; +- case MAKEINT4(d, e, v, i): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __selected_device = argi; +- break; +- case MAKEINT4(b, u, d, g): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __bundle_time_budget = argi; +- break; +- case MAKEINT3(e, x, p): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi >= 0) +- __pba_experimental = 
argi; +- break; +- case MAKEINT4(t, n, u, m): +- if (i + 1 < argc && sscanf(param, "%d", &argi) && argi > 0) +- __num_cpu_thread_all = argi; +- break; +- case MAKEINT4(p, r, o, f): +- __profile_pba = (i + 1 < argc && sscanf(param, "%d", &argi)) +- ? std::max(10, argi) +- : 100; +- break; +- case MAKEINT4(t, p, r, o): +- __cpu_thread_profile = true; +- break; +- case MAKEINT4(c, a, l, i): +- __fixed_intrinsics = true; +- break; +- case MAKEINT4(s, c, h, u): +- case MAKEINT4(s, s, o, r): +- __cg_schur_complement = true; +- break; +- case MAKEINT2(m, d): +- case MAKEINT4(r, a, d, i): +- __use_radial_distortion = -1; +- break; +- case MAKEINT2(p, d): +- __use_radial_distortion = 1; +- break; +- case MAKEINT3(r, 0, 0): +- __reset_initial_distortion = true; +- break; +- case MAKEINT4(v, a, r, i): +- __fixed_intrinsics = false; +- break; +- case MAKEINT4(n, a, c, c): +- __accurate_gain_ratio = false; +- break; +- case MAKEINT4(v, c, g, i): +- __verbose_cg_iteration = true; +- break; +- case MAKEINT4(v, f, u, n): +- __verbose_function_time = true; +- break; +- case MAKEINT4(v, a, l, l): +- __verbose_allocation = true; +- break; +- case MAKEINT4(v, s, s, e): +- __verbose_sse = true; +- break; +- case MAKEINT4(s, v, g, n): +- __save_gradient_norm = true; +- break; +- case MAKEINT2(i, d): +- __lm_use_diagonal_damp = false; +- break; +- case MAKEINT3(d, a, s): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __lm_damping_auto_switch = std::max(argf, 0.1f); +- else +- __lm_damping_auto_switch = 2.0f; +- break; +- case MAKEINT4(c, h, k, g): +- __lm_check_gradient = true; +- break; +- case MAKEINT4(n, o, j, n): +- __jacobian_normalize = false; +- break; +- case MAKEINT2(n, j): +- __no_jacobian_store = true; +- case MAKEINT3(n, j, c): +- __jc_store_transpose = false; +- __jc_store_original = false; +- break; +- case MAKEINT4(n, j, c, o): +- __jc_store_original = false; +- break; +- case MAKEINT4(n, j, c, t): +- __jc_store_transpose = false; +- break; +- case MAKEINT3(j, x, j): +- __multiply_jx_usenoj = false; +- break; +- case MAKEINT4(j, x, n, j): +- __multiply_jx_usenoj = true; +- break; +- case MAKEINT4(n, o, d, n): +- __depth_normalize = false; +- __focal_normalize = false; +- break; +- case MAKEINT4(n, o, d, f): +- __depth_degeneracy_fix = false; +- break; +- case MAKEINT4(n, o, r, m): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0) +- __data_normalize_median = argf; +- break; +- case MAKEINT3(d, c, e): +- if (i + 1 < argc && sscanf(param, "%f", &argf) && argf > 0 && +- argf <= 0.01) +- __depth_check_epsilon = argf; +- break; +- case MAKEINT4(d, e, b, u): +- __debug_pba = true; +- break; +- case MAKEINT4(e, v, a, l): +- __lm_max_iteration = 100; +- __warmup_device = true; +- case MAKEINT4(s, t, a, t): +- __stat_filename = (i + 1 < argc && param[0] != '-') ? param : NULL; +- break; +- case MAKEINT3(o, u, t): +- __driver_output = (i + 1 < argc && param[0] != '-') ? 
param : NULL; +- break; +- case MAKEINT4(w, a, r, m): +- __warmup_device = true; +- break; +- case MAKEINT4(m, o, t, i): +- __bundle_mode_next = 1; +- break; +- case MAKEINT4(s, t, r, u): +- __bundle_mode_next = 2; +- break; +- } +- } +-} +- +-} // namespace pba +diff --git a/lib/PBA/ConfigBA.h b/lib/PBA/ConfigBA.h +deleted file mode 100644 +index 74bd52439..000000000 +--- a/lib/PBA/ConfigBA.h ++++ /dev/null +@@ -1,226 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ConfigBA.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : configuration object class +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef CONFIG_BA_H +-#define CONFIG_BA_H +-#include +- +-namespace pba { +- +-class ConfigBA { +- protected: +- enum { +- TIMER_OVERALL = 0, +- TIMER_OPTIMIZATION, +- TIMER_GPU_ALLOCATION, +- TIMER_GPU_UPLOAD, +- TIMER_PREPROCESSING, +- TIMER_GPU_DOWNLOAD, +- TIMER_CG_ITERATION, +- TIMER_LM_ITERATION, +- TIMER_FUNCTION_JJ, +- TIMER_FUNCTION_PJ, +- TIMER_FUNCTION_DD, +- TIMER_FUNCTION_JX, +- TIMER_FUNCTION_JTE, +- TIMER_FUNCTION_BC, +- TIMER_FUNCTION_MP, +- TIMER_FUNCTION_UP, +- TIMER_PROFILE_STEP, +- NUM_TIMER, +- FUNC_JX = 0, +- FUNC_JX_, +- FUNC_JTEC_JCT, +- FUNC_JTEC_JCO, +- FUNC_JTEP, +- FUNC_JTE_, +- FUNC_JJ_JCO_JCT_JP, +- FUNC_JJ_JCO_JP, +- FUNC_JJ_JCT_JP, +- FUNC_JJ_JP, +- FUNC_PJ, +- FUNC_BCC_JCT, +- FUNC_BCC_JCO, +- FUNC_BCP, +- FUNC_MPC, +- FUNC_MPP, +- FUNC_VS, +- FUNC_VV, +- NUM_FUNC +- }; +- class TimerBA { +- ConfigBA* _config; +- int _timer; +- +- public: +- TimerBA(ConfigBA* config, int timer) { +- (_config = config)->BundleTimerStart(_timer = timer); +- } +- TimerBA(ConfigBA* config, int timer, bool) { +- (_config = config)->BundleTimerSwitch(_timer = timer); +- } +- ~TimerBA() { _config->BundleTimerSwitch(_timer); } +- }; +- friend class TimerBA; +- +- public: +- ////////////////////////////// +- int __lm_max_iteration; //(default 50) +- int __cg_max_iteration; //(default 100) +- int __cg_min_iteration; //(default 10) +- int __cg_recalculate_freq; //(default 0) +- bool __accurate_gain_ratio; //(default true) accurate gain ratio for +- //approximate solutions +- +- ////////////////////////////// +- float __lm_delta_threshold; //(default 1e-6)|dx|_2, I use absolute (not +- //relative) change +- float __lm_gradient_threshold; //(default 1e-10)|Jt * e|_inf +- float __lm_mse_threshold; //(default 0.25) quit if MSE is equal to or smaller +- //than this +- float __lm_initial_damp; //(default 0.001)initial damping factor +- float __lm_minimum_damp; //(default 1e-10)minimum damping factor +- float __lm_maximum_damp; +- float __cg_norm_threshold; //(default 0.1)terminate CG if norm ratio is less +- //than threshold +- float __cg_norm_guard; //(default 1.0)abort cg when norm increases to +- int __pba_experimental; +- bool __cg_schur_complement; 
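An editorial aside before the remainder of ConfigBA.h: the ParseParam implementation removed above matches option names by packing up to four lower-cased characters into a single int (the CHAR1_TO_INT/CHAR2_TO_INT/CHAR3_TO_INT and MAKEINT* macros), so that "-lmi", "-damp", "-v" and friends can be dispatched in one switch statement. The stand-alone sketch below re-creates that packing idea for illustration only; pack, lower1 and the two options handled are assumed names, not part of PBA or of this patch.

// --- editor's sketch: char-packed option dispatch (illustrative, not from the patch) ---
#include <cstdio>

// Lower-case one ASCII character, as CHAR1_TO_INT does in the removed code.
constexpr int lower1(char c) { return (c >= 'A' && c <= 'Z') ? c + 32 : c; }

// Pack the first (up to) four characters of a NUL-terminated option name.
constexpr int pack(const char* s, int i = 0) {
  return (i >= 4 || s[i] == '\0') ? 0 : lower1(s[i]) + (pack(s, i + 1) << 8);
}

int main(int argc, char** argv) {
  // Illustrative defaults; the real defaults live in ConfigBA's constructor.
  int lm_max_iteration = 50;
  int verbose_level = 2;
  for (int i = 1; i < argc; ++i) {
    const char* arg = argv[i];
    if (arg[0] != '-' || !arg[1]) continue;
    const char* param = (i + 1 < argc) ? argv[i + 1] : nullptr;
    switch (pack(arg + 1)) {
      case pack("lmi"):  // max LM iterations, like MAKEINT3(l, m, i) above
        if (param && std::sscanf(param, "%d", &lm_max_iteration) == 1) ++i;
        break;
      case pack("v"):    // verbosity, like MAKEINT1(v) above
        if (param && std::sscanf(param, "%d", &verbose_level) == 1) ++i;
        break;
      default:
        break;
    }
  }
  std::printf("lmi=%d v=%d\n", lm_max_iteration, verbose_level);
  return 0;
}
// --- end of editor's sketch ---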
+- +- ////////////////////////////// +- bool __lm_check_gradient; //(default false) check g_inf for convergence +- float __lm_damping_auto_switch; +- bool __lm_use_diagonal_damp; //(default true)use (Jt*J + lambda * diag(Jt*J)) +- //= Jt * e +- // or use (Jt*J + lambda * I) = Jt * e +- bool __fixed_intrinsics; //(default false) set true for calibrated camera +- //system +- int __use_radial_distortion; //(default 0, 1 for projection distortion, 2 for +- //measurement distortion) +- bool __reset_initial_distortion; //(default false) reset the initial +- //distortio to 0 +- +- //////////////////////////// +- int __verbose_level; //(default 2) how many messages to print out +- bool __abort_flag; //(default false)abort the bundle adjustment loop if set +- //true +- bool __verbose_cg_iteration; //(default false)print out details of Conjugate +- //Gradients +- bool __verbose_function_time; //(default false)print timing of some key +- //functions +- bool __save_gradient_norm; //(default false)save |Jt * e|_2 of each iteration +- bool __verbose_allocation; //(default false)whether print out allocation +- //details +- bool __verbose_sse; //(default false) show mse or sse +- +- /////////////////////////////////// +- bool __jc_store_transpose; //(default true) whether store transpose of JC +- bool __no_jacobian_store; //(default false) whether use memory saving mode +- bool __jc_store_original; //(default true) whether store original JC +- +- /////////////////////////////////// +- bool __jacobian_normalize; //(default true) scaling the jacobians according +- //to initial jacobians +- bool __focal_normalize; //(default true) data normalization +- bool __depth_normalize; //(default true) data normalization +- bool __depth_degeneracy_fix; +- float __data_normalize_median; +- float __depth_check_epsilon; +- ///////////////////////////// +- +- protected: +- bool __multiply_jx_usenoj; // for debug purpose +- protected: +- ///////////////////////////// +- int __selected_device; +- int __cpu_data_precision; +- int __bundle_time_budget; +- int __bundle_mode_next; +- int __bundle_current_mode; +- ////////////////////////////// +- float __initial_mse; +- float __final_mse; +- float __final_mse_x; +- float __focal_scaling; +- float __depth_scaling; +- int __current_device; +- int __current_iteration; +- int __num_cg_iteration; +- int __num_lm_success; +- int __num_lm_iteration; +- int __num_projection_eval; +- int __num_jacobian_eval; +- int __num_camera_modified; +- int __num_point_behind; +- int __pba_return_code; +- int __recent_cg_status; +- int __profile_pba; +- bool __cpu_thread_profile; +- bool __debug_pba; +- bool __warmup_device; +- size_t __memory_usage; +- ///////////////////////////////////// +- bool __matlab_format_stat; +- char* __stat_filename; +- const char* __driver_output; +- std::vector __bundle_records; +- double __timer_record[NUM_TIMER]; +- int __num_cpu_thread_all; +- int __num_cpu_thread[NUM_FUNC]; +- +- protected: +- ConfigBA(); +- /////////////////////////////// +- void ResetTemporarySetting(); +- void ResetBundleStatistics(); +- void PrintBundleStatistics(); +- void SaveBundleStatistics(int ncam, int npt, int nproj); +- /////////////////////////////////////// +- void BundleTimerStart(int timer); +- void BundleTimerSwitch(int timer); +- float BundleTimerGet(int timer); +- void BundleTimerSwap(int timer1, int timer2); +- float BundleTimerGetNow(int timer = TIMER_OPTIMIZATION); +- ///////////////////////////////// +- void SaveBundleRecord(int iter, float res, float damping, float gn, float 
gi); +- bool IsTimeBudgetAvailable(); +- double MyClock(); +- +- public: +- void ParseParam(int argc, char** argv); +- +- public: +- // the following are to be called after finishing BA +- const char* GetOutputParam() { return __driver_output; } +- float GetInitialMSE() { return __initial_mse; } +- float GetFinalMSE() { return __final_mse; } +- double GetBundleTiming(int timer = TIMER_OVERALL) { +- return __timer_record[timer]; +- } +- int GetIterationsLM() { return __num_lm_iteration; } +- int GetIterationsCG() { return __num_cg_iteration; } +- int GetCurrentDevice() { return __current_device; } +- int GetBundleReturnCode() { return __pba_return_code; } +- int GetActiveDevice() { return __selected_device; } +-}; +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/CuTexImage.cpp b/lib/PBA/CuTexImage.cpp +deleted file mode 100644 +index 400a0f3..0000000 +--- a/lib/PBA/CuTexImage.cpp ++++ /dev/null +@@ -1,137 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: CuTexImage.cpp +-// Author: Changchang Wu +-// Description : implementation of the CuTexImage class. +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-using namespace std; +- +-#include +-#include +-#include "CuTexImage.h" +- +-#if CUDA_VERSION <= 2010 +-#error "Require CUDA 2.2 or higher" +-#endif +- +-namespace pba { +- +-CuTexImage::CuTexImage() { +- _owner = true; +- _cuData = NULL; +- _numBytes = _numChannel = 0; +- _imgWidth = _imgHeight = 0; +-} +- +-CuTexImage::~CuTexImage() { +- if (_cuData && _owner) cudaFree(_cuData); +-} +- +-void CuTexImage::ReleaseData() { +- if (_cuData && _owner) cudaFree(_cuData); +- _cuData = NULL; +- _numBytes = 0; +-} +- +-void CuTexImage::SwapData(CuTexImage& src) { +- if (_cuData == src._cuData) return; +- +- void* cuData = _cuData; +- unsigned int numChannel = _numChannel; +- unsigned int imgWidth = _imgWidth; +- unsigned int imgHeight = _imgHeight; +- bool owner = _owner; +- size_t numBytes = _numBytes; +- +- _cuData = src._cuData; +- _numChannel = src._numChannel; +- _numBytes = src._numBytes; +- _imgWidth = src._imgWidth; +- _imgHeight = src._imgHeight; +- _owner = src._owner; +- +- src._cuData = cuData; +- src._numChannel = numChannel; +- src._numBytes = numBytes; +- src._imgWidth = imgWidth; +- src._imgHeight = imgHeight; +- src._owner = owner; +-} +- +-bool CuTexImage::InitTexture(unsigned int width, unsigned int height, +- unsigned int nchannel) { +- size_t size = sizeof(float) * width * height * nchannel; +- _imgWidth = width; +- _imgHeight = height; +- _numChannel = nchannel; +- +- if (size <= _numBytes) return true; +- +- if (_cuData && _owner) cudaFree(_cuData); +- +- // allocate the array data +- cudaError_t e = cudaMalloc(&_cuData, size); +- _numBytes = e == cudaSuccess ? 
size : 0; +- _owner = true; +- return e == cudaSuccess; +-} +- +-void CuTexImage::SetTexture(void* data, unsigned int width, +- unsigned int nchannel) { +- if (_cuData && _owner) cudaFree(_cuData); +- _imgWidth = width; +- _imgHeight = 1; +- _numChannel = nchannel; +- _numBytes = sizeof(float) * width * _imgHeight * _numChannel; +- _cuData = data; +- _owner = false; +-} +- +-void CuTexImage::CopyFromHost(const void* buf) { +- if (_cuData == NULL || buf == NULL || GetDataSize() == 0) return; +- cudaMemcpy(_cuData, buf, _imgWidth * _imgHeight * _numChannel * sizeof(float), +- cudaMemcpyHostToDevice); +-} +- +-void CuTexImage::CopyFromDevice(const void* buf) { +- if (_cuData == NULL) return; +- cudaMemcpy((char*)_cuData, buf, +- _imgWidth * _imgHeight * _numChannel * sizeof(float), +- cudaMemcpyDeviceToDevice); +-} +- +-void CuTexImage::CopyToHost(void* buf) { +- if (_cuData == NULL) return; +- size_t sz = _imgWidth * _imgHeight * _numChannel * sizeof(float); +- // cudaThreadSynchronize(); +- cudaMemcpy(buf, _cuData, sz, cudaMemcpyDeviceToHost); +- cudaThreadSynchronize(); +-} +- +-void CuTexImage::SaveToFile(const char* name) { +- ofstream out(name); +- vector value(GetLength()); +- CopyToHost(&value[0]); +- for (size_t i = 0; i < value.size(); ++i) out << value[i] << '\n'; +-} +- +-} // namespace pba +diff --git a/lib/PBA/CuTexImage.h b/lib/PBA/CuTexImage.h +deleted file mode 100644 +index e53e566e7..000000000 +--- a/lib/PBA/CuTexImage.h ++++ /dev/null +@@ -1,83 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: CuTexImage.h +-// Author: Changchang Wu +-// Description : interface for the CuTexImage class. +-// class for storing data in CUDA. +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
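Before the CuTexImage.h header continues, note the allocation strategy in the CuTexImage.cpp code removed above: InitTexture returns early when the requested size already fits within _numBytes and only frees and re-allocates when the buffer must grow, while SetTexture wraps externally owned memory without taking ownership. A minimal sketch of that grow-only device-buffer pattern follows; DeviceBuffer, Reserve and Upload are assumed names for illustration (not PBA API), and it needs the CUDA runtime to build.

// --- editor's sketch: grow-only CUDA device buffer (illustrative, not from the patch) ---
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>
#include <vector>

class DeviceBuffer {
 public:
  DeviceBuffer() = default;
  DeviceBuffer(const DeviceBuffer&) = delete;             // avoid double free
  DeviceBuffer& operator=(const DeviceBuffer&) = delete;
  ~DeviceBuffer() { if (data_) cudaFree(data_); }

  // Keep the current allocation whenever `bytes` still fits, as InitTexture does.
  bool Reserve(size_t bytes) {
    if (bytes <= capacity_) return true;
    if (data_) cudaFree(data_);
    data_ = nullptr;
    cudaError_t e = cudaMalloc(&data_, bytes);
    capacity_ = (e == cudaSuccess) ? bytes : 0;           // mirror the _numBytes handling
    return e == cudaSuccess;
  }

  bool Upload(const void* host, size_t bytes) {
    return Reserve(bytes) &&
           cudaMemcpy(data_, host, bytes, cudaMemcpyHostToDevice) == cudaSuccess;
  }

  void* data() const { return data_; }

 private:
  void* data_ = nullptr;
  size_t capacity_ = 0;
};

int main() {
  std::vector<float> host(1 << 20, 1.0f);
  DeviceBuffer buf;
  // The second upload reuses the first allocation because the size does not grow.
  bool ok = buf.Upload(host.data(), host.size() * sizeof(float)) &&
            buf.Upload(host.data(), host.size() * sizeof(float));
  std::printf("upload %s\n", ok ? "ok" : "failed");
  return 0;
}
// --- end of editor's sketch ---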
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef CU_TEX_IMAGE_H +-#define CU_TEX_IMAGE_H +- +-struct textureReference; +- +-namespace pba { +- +-class CuTexImage { +- protected: +- bool _owner; +- void* _cuData; +- unsigned int _numChannel; +- unsigned int _imgWidth; +- unsigned int _imgHeight; +- size_t _numBytes; +- +- public: +- bool InitTexture(unsigned int width, unsigned int height, +- unsigned int nchannel = 1); +- void SetTexture(void* data, unsigned int width, unsigned int nchannel = 1); +- void BindTexture(textureReference& texRef); +- void BindTexture(textureReference& texRef, int offset, size_t size); +- void BindTexture2(textureReference& texRef1, textureReference& texRef2); +- void BindTexture4(textureReference& texRef1, textureReference& texRef2, +- textureReference& texRef3, textureReference& texRef4); +- int BindTextureX(textureReference& texRef1, textureReference& texRef2, +- textureReference& texRef3, textureReference& texRef4, +- bool force4); +- void SwapData(CuTexImage& src); +- void CopyToHost(void* buf); +- void CopyFromDevice(const void* buf); +- void CopyFromHost(const void* buf); +- void SaveToFile(const char* name); +- void ReleaseData(); +- +- public: +- inline float* data() { return GetRequiredSize() ? ((float*)_cuData) : NULL; } +- inline bool IsValid() { return _cuData != NULL && GetDataSize() > 0; } +- inline unsigned int GetLength() { +- return _imgWidth * _imgHeight * _numChannel; +- } +- inline unsigned int GetImgWidth() { return _imgWidth; } +- inline unsigned int GetImgHeight() { return _imgHeight; } +- inline size_t GetReservedWidth() { +- return _numBytes == 0 +- ? 0 +- : (_numBytes / (_imgHeight * _numChannel * sizeof(float))); +- } +- inline size_t GetDataSize() { return _numBytes == 0 ? 0 : GetRequiredSize(); } +- inline size_t GetRequiredSize() { +- return sizeof(float) * _imgWidth * _imgHeight * _numChannel; +- } +- inline unsigned int IsHugeData() { return (GetLength() - 1) / (1 << 27); } +- +- public: +- CuTexImage(); +- virtual ~CuTexImage(); +-}; +- +-} // namespace pba +- +-#endif // !defined(CU_TEX_IMAGE_H) +diff --git a/lib/PBA/DataInterface.h b/lib/PBA/DataInterface.h +deleted file mode 100644 +index b465bd60a..000000000 +--- a/lib/PBA/DataInterface.h ++++ /dev/null +@@ -1,423 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: DataInterface.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : data interface, the data format been uploaded to GPU +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
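DataInterface.h, removed next, defines the plain transfer structs (CameraT_, Point3D_, Point2D) that PBA copies to the GPU; the deleted code below describes them as using a "4-float alignment", and Point3D_ carries an explicit reserved float so that each point occupies 16 bytes. A tiny self-contained illustration of that padding idea, with MyPoint as a hypothetical stand-in rather than the library's type:

// --- editor's sketch: 4-float (16-byte) point layout (illustrative, not from the patch) ---
#include <cstdio>

struct MyPoint {
  float xyz[3];    // 3D point location, as in Point3D_
  float reserved;  // explicit padding: 3 + 1 floats = 16 bytes per record
};

// 16-byte records keep point arrays aligned for float4-style GPU loads.
static_assert(sizeof(MyPoint) == 4 * sizeof(float), "expected a 4-float layout");

int main() {
  MyPoint p = {{1.0f, 2.0f, 3.0f}, 0.0f};
  std::printf("sizeof(MyPoint) = %zu bytes, x = %.1f\n", sizeof(MyPoint), p.xyz[0]);
  return 0;
}
// --- end of editor's sketch ---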
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef DATA_INTERFACE_GPU_H +-#define DATA_INTERFACE_GPU_H +- +-#include +- +-// ----------------------------WARNING------------------------------ +-// ----------------------------------------------------------------- +-// ROTATION CONVERSION: +-// The internal rotation representation is 3x3 float matrix. Reading +-// back the rotations as quaternion or Rodrigues's representation will +-// cause inaccuracy, IF you have wrongly reconstructed cameras with +-// a very very large focal length (typically also very far away). +-// In this case, any small change in the rotation matrix, will cause +-// a large reprojection error. +-// +-// --------------------------------------------------------------------- +-// RADIAL distortion is NOT enabled by default, use parameter "-md", -pd" +-// or set ConfigBA::__use_radial_distortion to 1 or -1 to enable it. +-// --------------------------------------------------------------------------- +- +-namespace pba { +- +-// transfer data type with 4-float alignment +-#define CameraT CameraT_ +-#define Point3D Point3D_ +-template +- +-struct CameraT_ { +- typedef FT float_t; +- ////////////////////////////////////////////////////// +- float_t f; // single focal length, K = [f, 0, 0; 0 f 0; 0 0 1] +- float_t t[3]; // T in P = K[R T], T = - RC +- float_t m[3][3]; // R in P = K[R T]. +- float_t radial; // WARNING: BE careful with the radial distortion model. +- int distortion_type; +- float_t constant_camera; +- +- ////////////////////////////////////////////////////////// +- CameraT_() { +- radial = 0; +- distortion_type = 0; +- constant_camera = 0; +- } +- +- ////////////////////////////////////////////// +- template +- void SetCameraT(const CameraX& cam) { +- f = (float_t)cam.f; +- t[0] = (float_t)cam.t[0]; +- t[1] = (float_t)cam.t[1]; +- t[2] = (float_t)cam.t[2]; +- for (int i = 0; i < 3; ++i) +- for (int j = 0; j < 3; ++j) m[i][j] = (float_t)cam.m[i][j]; +- radial = (float_t)cam.radial; +- distortion_type = cam.distortion_type; +- constant_camera = cam.constant_camera; +- } +- +- ////////////////////////////////////////// +- void SetConstantCamera() { constant_camera = 1.0f; } +- void SetVariableCamera() { constant_camera = 0.0f; } +- void SetFixedIntrinsic() { constant_camera = 2.0f; } +- // void SetFixedExtrinsic() {constant_camera = 3.0f;} +- +- ////////////////////////////////////// +- template +- void SetFocalLength(Float F) { +- f = (float_t)F; +- } +- float_t GetFocalLength() const { return f; } +- +- template +- void SetMeasurementDistortion(Float r) { +- radial = (float_t)r; +- distortion_type = -1; +- } +- float_t GetMeasurementDistortion() const { +- return distortion_type == -1 ? radial : 0; +- } +- +- // normalize radial distortion that applies to angle will be (radial * f * f); +- template +- void SetNormalizedMeasurementDistortion(Float r) { +- SetMeasurementDistortion(r / (f * f)); +- } +- float_t GetNormalizedMeasurementDistortion() const { +- return GetMeasurementDistortion() * (f * f); +- } +- +- // use projection distortion +- template +- void SetProjectionDistortion(Float r) { +- radial = float_t(r); +- distortion_type = 1; +- } +- template +- void SetProjectionDistortion(const Float* r) { +- SetProjectionDistortion(r[0]); +- } +- float_t GetProjectionDistortion() const { +- return distortion_type == 1 ? 
radial : 0; +- } +- +- template +- void SetRodriguesRotation(const Float r[3]) { +- double a = sqrt(r[0] * r[0] + r[1] * r[1] + r[2] * r[2]); +- double ct = a == 0.0 ? 0.5 : (1.0 - cos(a)) / a / a; +- double st = a == 0.0 ? 1 : sin(a) / a; +- m[0][0] = float_t(1.0 - (r[1] * r[1] + r[2] * r[2]) * ct); +- m[0][1] = float_t(r[0] * r[1] * ct - r[2] * st); +- m[0][2] = float_t(r[2] * r[0] * ct + r[1] * st); +- m[1][0] = float_t(r[0] * r[1] * ct + r[2] * st); +- m[1][1] = float_t(1.0 - (r[2] * r[2] + r[0] * r[0]) * ct); +- m[1][2] = float_t(r[1] * r[2] * ct - r[0] * st); +- m[2][0] = float_t(r[2] * r[0] * ct - r[1] * st); +- m[2][1] = float_t(r[1] * r[2] * ct + r[0] * st); +- m[2][2] = float_t(1.0 - (r[0] * r[0] + r[1] * r[1]) * ct); +- } +- template +- void GetRodriguesRotation(Float r[3]) const { +- double a = (m[0][0] + m[1][1] + m[2][2] - 1.0) / 2.0; +- const double epsilon = 0.01; +- if (fabs(m[0][1] - m[1][0]) < epsilon && +- fabs(m[1][2] - m[2][1]) < epsilon && +- fabs(m[0][2] - m[2][0]) < epsilon) { +- if (fabs(m[0][1] + m[1][0]) < 0.1 && fabs(m[1][2] + m[2][1]) < 0.1 && +- fabs(m[0][2] + m[2][0]) < 0.1 && a > 0.9) { +- r[0] = 0; +- r[1] = 0; +- r[2] = 0; +- } else { +- const Float ha = Float(sqrt(0.5) * 3.14159265358979323846); +- double xx = (m[0][0] + 1.0) / 2.0; +- double yy = (m[1][1] + 1.0) / 2.0; +- double zz = (m[2][2] + 1.0) / 2.0; +- double xy = (m[0][1] + m[1][0]) / 4.0; +- double xz = (m[0][2] + m[2][0]) / 4.0; +- double yz = (m[1][2] + m[2][1]) / 4.0; +- +- if ((xx > yy) && (xx > zz)) { +- if (xx < epsilon) { +- r[0] = 0; +- r[1] = r[2] = ha; +- } else { +- double t = sqrt(xx); +- r[0] = Float(t * 3.14159265358979323846); +- r[1] = Float(xy / t * 3.14159265358979323846); +- r[2] = Float(xz / t * 3.14159265358979323846); +- } +- } else if (yy > zz) { +- if (yy < epsilon) { +- r[0] = r[2] = ha; +- r[1] = 0; +- } else { +- double t = sqrt(yy); +- r[0] = Float(xy / t * 3.14159265358979323846); +- r[1] = Float(t * 3.14159265358979323846); +- r[2] = Float(yz / t * 3.14159265358979323846); +- } +- } else { +- if (zz < epsilon) { +- r[0] = r[1] = ha; +- r[2] = 0; +- } else { +- double t = sqrt(zz); +- r[0] = Float(xz / t * 3.14159265358979323846); +- r[1] = Float(yz / t * 3.14159265358979323846); +- r[2] = Float(t * 3.14159265358979323846); +- } +- } +- } +- } else { +- a = acos(a); +- double b = 0.5 * a / sin(a); +- r[0] = Float(b * (m[2][1] - m[1][2])); +- r[1] = Float(b * (m[0][2] - m[2][0])); +- r[2] = Float(b * (m[1][0] - m[0][1])); +- } +- } +- //////////////////////// +- template +- void SetQuaternionRotation(const Float q[4]) { +- double qq = sqrt(q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]); +- double qw, qx, qy, qz; +- if (qq > 0) { +- qw = q[0] / qq; +- qx = q[1] / qq; +- qy = q[2] / qq; +- qz = q[3] / qq; +- } else { +- qw = 1; +- qx = qy = qz = 0; +- } +- m[0][0] = float_t(qw * qw + qx * qx - qz * qz - qy * qy); +- m[0][1] = float_t(2 * qx * qy - 2 * qz * qw); +- m[0][2] = float_t(2 * qy * qw + 2 * qz * qx); +- m[1][0] = float_t(2 * qx * qy + 2 * qw * qz); +- m[1][1] = float_t(qy * qy + qw * qw - qz * qz - qx * qx); +- m[1][2] = float_t(2 * qz * qy - 2 * qx * qw); +- m[2][0] = float_t(2 * qx * qz - 2 * qy * qw); +- m[2][1] = float_t(2 * qy * qz + 2 * qw * qx); +- m[2][2] = float_t(qz * qz + qw * qw - qy * qy - qx * qx); +- } +- template +- void GetQuaternionRotation(Float q[4]) const { +- q[0] = 1 + m[0][0] + m[1][1] + m[2][2]; +- if (q[0] > 0.000000001) { +- q[0] = sqrt(q[0]) / 2.0; +- q[1] = (m[2][1] - m[1][2]) / (4.0 * q[0]); +- q[2] = (m[0][2] - m[2][0]) / 
(4.0 * q[0]); +- q[3] = (m[1][0] - m[0][1]) / (4.0 * q[0]); +- } else { +- double s; +- if (m[0][0] > m[1][1] && m[0][0] > m[2][2]) { +- s = 2.0 * sqrt(1.0 + m[0][0] - m[1][1] - m[2][2]); +- q[1] = 0.25 * s; +- q[2] = (m[0][1] + m[1][0]) / s; +- q[3] = (m[0][2] + m[2][0]) / s; +- q[0] = (m[1][2] - m[2][1]) / s; +- } else if (m[1][1] > m[2][2]) { +- s = 2.0 * sqrt(1.0 + m[1][1] - m[0][0] - m[2][2]); +- q[1] = (m[0][1] + m[1][0]) / s; +- q[2] = 0.25 * s; +- q[3] = (m[1][2] + m[2][1]) / s; +- q[0] = (m[0][2] - m[2][0]) / s; +- } else { +- s = 2.0 * sqrt(1.0 + m[2][2] - m[0][0] - m[1][1]); +- q[1] = (m[0][2] + m[2][0]) / s; +- q[2] = (m[1][2] + m[2][1]) / s; +- q[3] = 0.25f * s; +- q[0] = (m[0][1] - m[1][0]) / s; +- } +- } +- } +- //////////////////////////////////////////////// +- template +- void SetMatrixRotation(const Float* r) { +- int k = 0; +- for (int i = 0; i < 3; ++i) { +- for (int j = 0; j < 3; ++j) { +- m[i][j] = float_t(r[k++]); +- } +- } +- } +- template +- void GetMatrixRotation(Float* r) const { +- int k = 0; +- for (int i = 0; i < 3; ++i) { +- for (int j = 0; j < 3; ++j) { +- r[k++] = Float(m[i][j]); +- } +- } +- } +- float GetRotationMatrixDeterminant() const { +- return m[0][0] * m[1][1] * m[2][2] + m[0][1] * m[1][2] * m[2][0] + +- m[0][2] * m[1][0] * m[2][1] - m[0][2] * m[1][1] * m[2][0] - +- m[0][1] * m[1][0] * m[2][2] - m[0][0] * m[1][2] * m[2][1]; +- } +- /////////////////////////////////////// +- template +- void SetTranslation(const Float T[3]) { +- t[0] = (float_t)T[0]; +- t[1] = (float_t)T[1]; +- t[2] = (float_t)T[2]; +- } +- template +- void GetTranslation(Float T[3]) const { +- T[0] = (Float)t[0]; +- T[1] = (Float)t[1]; +- T[2] = (Float)t[2]; +- } +- ///////////////////////////////////////////// +- template +- void SetCameraCenterAfterRotation(const Float c[3]) { +- // t = - R * C +- for (int j = 0; j < 3; ++j) +- t[j] = -float_t(m[j][0] * c[0] + m[j][1] * c[1] + m[j][2] * c[2]); +- } +- template +- void GetCameraCenter(Float c[3]) { +- // C = - R' * t +- for (int j = 0; j < 3; ++j) +- c[j] = -float_t(m[0][j] * t[0] + m[1][j] * t[1] + m[2][j] * t[2]); +- } +- //////////////////////////////////////////// +- template +- void SetInvertedRT(const Float e[3], const Float T[3]) { +- SetRodriguesRotation(e); +- for (int i = 3; i < 9; ++i) m[0][i] = -m[0][i]; +- SetTranslation(T); +- t[1] = -t[1]; +- t[2] = -t[2]; +- } +- +- template +- void GetInvertedRT(Float e[3], Float T[3]) const { +- CameraT ci; +- ci.SetMatrixRotation(m[0]); +- for (int i = 3; i < 9; ++i) ci.m[0][i] = -ci.m[0][i]; +- // for(int i = 1; i < 3; ++i) for(int j = 0; j < 3; ++j) ci.m[i][j] = - +- // ci.m[i][j]; +- ci.GetRodriguesRotation(e); +- GetTranslation(T); +- T[1] = -T[1]; +- T[2] = -T[2]; +- } +- template +- void SetInvertedR9T(const Float e[9], const Float T[3]) { +- // for(int i = 0; i < 9; ++i) m[0][i] = (i < 3 ? 
e[i] : - e[i]); +- // SetTranslation(T); t[1] = - t[1]; t[2] = -t[2]; +- m[0][0] = e[0]; +- m[0][1] = e[1]; +- m[0][2] = e[2]; +- m[1][0] = -e[3]; +- m[1][1] = -e[4]; +- m[1][2] = -e[5]; +- m[2][0] = -e[6]; +- m[2][1] = -e[7]; +- m[2][2] = -e[8]; +- t[0] = T[0]; +- t[1] = -T[1]; +- t[2] = -T[2]; +- } +- template +- void GetInvertedR9T(Float e[9], Float T[3]) const { +- e[0] = m[0][0]; +- e[1] = m[0][1]; +- e[2] = m[0][2]; +- e[3] = -m[1][0]; +- e[4] = -m[1][1]; +- e[5] = -m[1][2]; +- e[6] = -m[2][0]; +- e[7] = -m[2][1]; +- e[8] = -m[2][2]; +- T[0] = t[0]; +- T[1] = -t[1]; +- T[2] = -t[2]; +- } +-}; +- +-template +-struct Point3D { +- typedef FT float_t; +- float_t xyz[3]; // 3D point location +- float_t reserved; // alignment +- //////////////////////////////// +- template +- void SetPoint(Float x, Float y, Float z) { +- xyz[0] = (float_t)x; +- xyz[1] = (float_t)y; +- xyz[2] = (float_t)z; +- reserved = 0; +- } +- template +- void SetPoint(const Float* p) { +- xyz[0] = (float_t)p[0]; +- xyz[1] = (float_t)p[1]; +- xyz[2] = (float_t)p[2]; +- reserved = 0; +- } +- template +- void GetPoint(Float* p) const { +- p[0] = (Float)xyz[0]; +- p[1] = (Float)xyz[1]; +- p[2] = (Float)xyz[2]; +- } +- template +- void GetPoint(Float& x, Float& y, Float& z) const { +- x = (Float)xyz[0]; +- y = (Float)xyz[1]; +- z = (Float)xyz[2]; +- } +-}; +- +-#undef CameraT +-#undef Point3D +- +-typedef CameraT_ CameraT; +-typedef Point3D_ Point3D; +- +-struct Point2D { +- float x, y; +- //////////////////////////////////////////////////////// +- Point2D() {} +- template +- Point2D(Float X, Float Y) { +- SetPoint2D(X, Y); +- } +- template +- void SetPoint2D(Float X, Float Y) { +- x = (float)X; +- y = (float)Y; +- } +- template +- void GetPoint2D(Float& X, Float& Y) const { +- X = (Float)x; +- Y = (Float)y; +- } +-}; +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/LICENSE b/lib/PBA/LICENSE +deleted file mode 100755 +index 94a9ed024..000000000 +--- a/lib/PBA/LICENSE ++++ /dev/null +@@ -1,674 +0,0 @@ +- GNU GENERAL PUBLIC LICENSE +- Version 3, 29 June 2007 +- +- Copyright (C) 2007 Free Software Foundation, Inc. +- Everyone is permitted to copy and distribute verbatim copies +- of this license document, but changing it is not allowed. +- +- Preamble +- +- The GNU General Public License is a free, copyleft license for +-software and other kinds of works. +- +- The licenses for most software and other practical works are designed +-to take away your freedom to share and change the works. By contrast, +-the GNU General Public License is intended to guarantee your freedom to +-share and change all versions of a program--to make sure it remains free +-software for all its users. We, the Free Software Foundation, use the +-GNU General Public License for most of our software; it applies also to +-any other work released this way by its authors. You can apply it to +-your programs, too. +- +- When we speak of free software, we are referring to freedom, not +-price. Our General Public Licenses are designed to make sure that you +-have the freedom to distribute copies of free software (and charge for +-them if you wish), that you receive source code or can get it if you +-want it, that you can change the software or use pieces of it in new +-free programs, and that you know you can do these things. +- +- To protect your rights, we need to prevent others from denying you +-these rights or asking you to surrender the rights. 
Therefore, you have +-certain responsibilities if you distribute copies of the software, or if +-you modify it: responsibilities to respect the freedom of others. +- +- For example, if you distribute copies of such a program, whether +-gratis or for a fee, you must pass on to the recipients the same +-freedoms that you received. You must make sure that they, too, receive +-or can get the source code. And you must show them these terms so they +-know their rights. +- +- Developers that use the GNU GPL protect your rights with two steps: +-(1) assert copyright on the software, and (2) offer you this License +-giving you legal permission to copy, distribute and/or modify it. +- +- For the developers' and authors' protection, the GPL clearly explains +-that there is no warranty for this free software. For both users' and +-authors' sake, the GPL requires that modified versions be marked as +-changed, so that their problems will not be attributed erroneously to +-authors of previous versions. +- +- Some devices are designed to deny users access to install or run +-modified versions of the software inside them, although the manufacturer +-can do so. This is fundamentally incompatible with the aim of +-protecting users' freedom to change the software. The systematic +-pattern of such abuse occurs in the area of products for individuals to +-use, which is precisely where it is most unacceptable. Therefore, we +-have designed this version of the GPL to prohibit the practice for those +-products. If such problems arise substantially in other domains, we +-stand ready to extend this provision to those domains in future versions +-of the GPL, as needed to protect the freedom of users. +- +- Finally, every program is threatened constantly by software patents. +-States should not allow patents to restrict development and use of +-software on general-purpose computers, but in those that do, we wish to +-avoid the special danger that patents applied to a free program could +-make it effectively proprietary. To prevent this, the GPL assures that +-patents cannot be used to render the program non-free. +- +- The precise terms and conditions for copying, distribution and +-modification follow. +- +- TERMS AND CONDITIONS +- +- 0. Definitions. +- +- "This License" refers to version 3 of the GNU General Public License. +- +- "Copyright" also means copyright-like laws that apply to other kinds of +-works, such as semiconductor masks. +- +- "The Program" refers to any copyrightable work licensed under this +-License. Each licensee is addressed as "you". "Licensees" and +-"recipients" may be individuals or organizations. +- +- To "modify" a work means to copy from or adapt all or part of the work +-in a fashion requiring copyright permission, other than the making of an +-exact copy. The resulting work is called a "modified version" of the +-earlier work or a work "based on" the earlier work. +- +- A "covered work" means either the unmodified Program or a work based +-on the Program. +- +- To "propagate" a work means to do anything with it that, without +-permission, would make you directly or secondarily liable for +-infringement under applicable copyright law, except executing it on a +-computer or modifying a private copy. Propagation includes copying, +-distribution (with or without modification), making available to the +-public, and in some countries other activities as well. +- +- To "convey" a work means any kind of propagation that enables other +-parties to make or receive copies. 
Mere interaction with a user through +-a computer network, with no transfer of a copy, is not conveying. +- +- An interactive user interface displays "Appropriate Legal Notices" +-to the extent that it includes a convenient and prominently visible +-feature that (1) displays an appropriate copyright notice, and (2) +-tells the user that there is no warranty for the work (except to the +-extent that warranties are provided), that licensees may convey the +-work under this License, and how to view a copy of this License. If +-the interface presents a list of user commands or options, such as a +-menu, a prominent item in the list meets this criterion. +- +- 1. Source Code. +- +- The "source code" for a work means the preferred form of the work +-for making modifications to it. "Object code" means any non-source +-form of a work. +- +- A "Standard Interface" means an interface that either is an official +-standard defined by a recognized standards body, or, in the case of +-interfaces specified for a particular programming language, one that +-is widely used among developers working in that language. +- +- The "System Libraries" of an executable work include anything, other +-than the work as a whole, that (a) is included in the normal form of +-packaging a Major Component, but which is not part of that Major +-Component, and (b) serves only to enable use of the work with that +-Major Component, or to implement a Standard Interface for which an +-implementation is available to the public in source code form. A +-"Major Component", in this context, means a major essential component +-(kernel, window system, and so on) of the specific operating system +-(if any) on which the executable work runs, or a compiler used to +-produce the work, or an object code interpreter used to run it. +- +- The "Corresponding Source" for a work in object code form means all +-the source code needed to generate, install, and (for an executable +-work) run the object code and to modify the work, including scripts to +-control those activities. However, it does not include the work's +-System Libraries, or general-purpose tools or generally available free +-programs which are used unmodified in performing those activities but +-which are not part of the work. For example, Corresponding Source +-includes interface definition files associated with source files for +-the work, and the source code for shared libraries and dynamically +-linked subprograms that the work is specifically designed to require, +-such as by intimate data communication or control flow between those +-subprograms and other parts of the work. +- +- The Corresponding Source need not include anything that users +-can regenerate automatically from other parts of the Corresponding +-Source. +- +- The Corresponding Source for a work in source code form is that +-same work. +- +- 2. Basic Permissions. +- +- All rights granted under this License are granted for the term of +-copyright on the Program, and are irrevocable provided the stated +-conditions are met. This License explicitly affirms your unlimited +-permission to run the unmodified Program. The output from running a +-covered work is covered by this License only if the output, given its +-content, constitutes a covered work. This License acknowledges your +-rights of fair use or other equivalent, as provided by copyright law. +- +- You may make, run and propagate covered works that you do not +-convey, without conditions so long as your license otherwise remains +-in force. 
You may convey covered works to others for the sole purpose +-of having them make modifications exclusively for you, or provide you +-with facilities for running those works, provided that you comply with +-the terms of this License in conveying all material for which you do +-not control copyright. Those thus making or running the covered works +-for you must do so exclusively on your behalf, under your direction +-and control, on terms that prohibit them from making any copies of +-your copyrighted material outside their relationship with you. +- +- Conveying under any other circumstances is permitted solely under +-the conditions stated below. Sublicensing is not allowed; section 10 +-makes it unnecessary. +- +- 3. Protecting Users' Legal Rights From Anti-Circumvention Law. +- +- No covered work shall be deemed part of an effective technological +-measure under any applicable law fulfilling obligations under article +-11 of the WIPO copyright treaty adopted on 20 December 1996, or +-similar laws prohibiting or restricting circumvention of such +-measures. +- +- When you convey a covered work, you waive any legal power to forbid +-circumvention of technological measures to the extent such circumvention +-is effected by exercising rights under this License with respect to +-the covered work, and you disclaim any intention to limit operation or +-modification of the work as a means of enforcing, against the work's +-users, your or third parties' legal rights to forbid circumvention of +-technological measures. +- +- 4. Conveying Verbatim Copies. +- +- You may convey verbatim copies of the Program's source code as you +-receive it, in any medium, provided that you conspicuously and +-appropriately publish on each copy an appropriate copyright notice; +-keep intact all notices stating that this License and any +-non-permissive terms added in accord with section 7 apply to the code; +-keep intact all notices of the absence of any warranty; and give all +-recipients a copy of this License along with the Program. +- +- You may charge any price or no price for each copy that you convey, +-and you may offer support or warranty protection for a fee. +- +- 5. Conveying Modified Source Versions. +- +- You may convey a work based on the Program, or the modifications to +-produce it from the Program, in the form of source code under the +-terms of section 4, provided that you also meet all of these conditions: +- +- a) The work must carry prominent notices stating that you modified +- it, and giving a relevant date. +- +- b) The work must carry prominent notices stating that it is +- released under this License and any conditions added under section +- 7. This requirement modifies the requirement in section 4 to +- "keep intact all notices". +- +- c) You must license the entire work, as a whole, under this +- License to anyone who comes into possession of a copy. This +- License will therefore apply, along with any applicable section 7 +- additional terms, to the whole of the work, and all its parts, +- regardless of how they are packaged. This License gives no +- permission to license the work in any other way, but it does not +- invalidate such permission if you have separately received it. +- +- d) If the work has interactive user interfaces, each must display +- Appropriate Legal Notices; however, if the Program has interactive +- interfaces that do not display Appropriate Legal Notices, your +- work need not make them do so. 
+- +- A compilation of a covered work with other separate and independent +-works, which are not by their nature extensions of the covered work, +-and which are not combined with it such as to form a larger program, +-in or on a volume of a storage or distribution medium, is called an +-"aggregate" if the compilation and its resulting copyright are not +-used to limit the access or legal rights of the compilation's users +-beyond what the individual works permit. Inclusion of a covered work +-in an aggregate does not cause this License to apply to the other +-parts of the aggregate. +- +- 6. Conveying Non-Source Forms. +- +- You may convey a covered work in object code form under the terms +-of sections 4 and 5, provided that you also convey the +-machine-readable Corresponding Source under the terms of this License, +-in one of these ways: +- +- a) Convey the object code in, or embodied in, a physical product +- (including a physical distribution medium), accompanied by the +- Corresponding Source fixed on a durable physical medium +- customarily used for software interchange. +- +- b) Convey the object code in, or embodied in, a physical product +- (including a physical distribution medium), accompanied by a +- written offer, valid for at least three years and valid for as +- long as you offer spare parts or customer support for that product +- model, to give anyone who possesses the object code either (1) a +- copy of the Corresponding Source for all the software in the +- product that is covered by this License, on a durable physical +- medium customarily used for software interchange, for a price no +- more than your reasonable cost of physically performing this +- conveying of source, or (2) access to copy the +- Corresponding Source from a network server at no charge. +- +- c) Convey individual copies of the object code with a copy of the +- written offer to provide the Corresponding Source. This +- alternative is allowed only occasionally and noncommercially, and +- only if you received the object code with such an offer, in accord +- with subsection 6b. +- +- d) Convey the object code by offering access from a designated +- place (gratis or for a charge), and offer equivalent access to the +- Corresponding Source in the same way through the same place at no +- further charge. You need not require recipients to copy the +- Corresponding Source along with the object code. If the place to +- copy the object code is a network server, the Corresponding Source +- may be on a different server (operated by you or a third party) +- that supports equivalent copying facilities, provided you maintain +- clear directions next to the object code saying where to find the +- Corresponding Source. Regardless of what server hosts the +- Corresponding Source, you remain obligated to ensure that it is +- available for as long as needed to satisfy these requirements. +- +- e) Convey the object code using peer-to-peer transmission, provided +- you inform other peers where the object code and Corresponding +- Source of the work are being offered to the general public at no +- charge under subsection 6d. +- +- A separable portion of the object code, whose source code is excluded +-from the Corresponding Source as a System Library, need not be +-included in conveying the object code work. 
+- +- A "User Product" is either (1) a "consumer product", which means any +-tangible personal property which is normally used for personal, family, +-or household purposes, or (2) anything designed or sold for incorporation +-into a dwelling. In determining whether a product is a consumer product, +-doubtful cases shall be resolved in favor of coverage. For a particular +-product received by a particular user, "normally used" refers to a +-typical or common use of that class of product, regardless of the status +-of the particular user or of the way in which the particular user +-actually uses, or expects or is expected to use, the product. A product +-is a consumer product regardless of whether the product has substantial +-commercial, industrial or non-consumer uses, unless such uses represent +-the only significant mode of use of the product. +- +- "Installation Information" for a User Product means any methods, +-procedures, authorization keys, or other information required to install +-and execute modified versions of a covered work in that User Product from +-a modified version of its Corresponding Source. The information must +-suffice to ensure that the continued functioning of the modified object +-code is in no case prevented or interfered with solely because +-modification has been made. +- +- If you convey an object code work under this section in, or with, or +-specifically for use in, a User Product, and the conveying occurs as +-part of a transaction in which the right of possession and use of the +-User Product is transferred to the recipient in perpetuity or for a +-fixed term (regardless of how the transaction is characterized), the +-Corresponding Source conveyed under this section must be accompanied +-by the Installation Information. But this requirement does not apply +-if neither you nor any third party retains the ability to install +-modified object code on the User Product (for example, the work has +-been installed in ROM). +- +- The requirement to provide Installation Information does not include a +-requirement to continue to provide support service, warranty, or updates +-for a work that has been modified or installed by the recipient, or for +-the User Product in which it has been modified or installed. Access to a +-network may be denied when the modification itself materially and +-adversely affects the operation of the network or violates the rules and +-protocols for communication across the network. +- +- Corresponding Source conveyed, and Installation Information provided, +-in accord with this section must be in a format that is publicly +-documented (and with an implementation available to the public in +-source code form), and must require no special password or key for +-unpacking, reading or copying. +- +- 7. Additional Terms. +- +- "Additional permissions" are terms that supplement the terms of this +-License by making exceptions from one or more of its conditions. +-Additional permissions that are applicable to the entire Program shall +-be treated as though they were included in this License, to the extent +-that they are valid under applicable law. If additional permissions +-apply only to part of the Program, that part may be used separately +-under those permissions, but the entire Program remains governed by +-this License without regard to the additional permissions. +- +- When you convey a copy of a covered work, you may at your option +-remove any additional permissions from that copy, or from any part of +-it. 
(Additional permissions may be written to require their own +-removal in certain cases when you modify the work.) You may place +-additional permissions on material, added by you to a covered work, +-for which you have or can give appropriate copyright permission. +- +- Notwithstanding any other provision of this License, for material you +-add to a covered work, you may (if authorized by the copyright holders of +-that material) supplement the terms of this License with terms: +- +- a) Disclaiming warranty or limiting liability differently from the +- terms of sections 15 and 16 of this License; or +- +- b) Requiring preservation of specified reasonable legal notices or +- author attributions in that material or in the Appropriate Legal +- Notices displayed by works containing it; or +- +- c) Prohibiting misrepresentation of the origin of that material, or +- requiring that modified versions of such material be marked in +- reasonable ways as different from the original version; or +- +- d) Limiting the use for publicity purposes of names of licensors or +- authors of the material; or +- +- e) Declining to grant rights under trademark law for use of some +- trade names, trademarks, or service marks; or +- +- f) Requiring indemnification of licensors and authors of that +- material by anyone who conveys the material (or modified versions of +- it) with contractual assumptions of liability to the recipient, for +- any liability that these contractual assumptions directly impose on +- those licensors and authors. +- +- All other non-permissive additional terms are considered "further +-restrictions" within the meaning of section 10. If the Program as you +-received it, or any part of it, contains a notice stating that it is +-governed by this License along with a term that is a further +-restriction, you may remove that term. If a license document contains +-a further restriction but permits relicensing or conveying under this +-License, you may add to a covered work material governed by the terms +-of that license document, provided that the further restriction does +-not survive such relicensing or conveying. +- +- If you add terms to a covered work in accord with this section, you +-must place, in the relevant source files, a statement of the +-additional terms that apply to those files, or a notice indicating +-where to find the applicable terms. +- +- Additional terms, permissive or non-permissive, may be stated in the +-form of a separately written license, or stated as exceptions; +-the above requirements apply either way. +- +- 8. Termination. +- +- You may not propagate or modify a covered work except as expressly +-provided under this License. Any attempt otherwise to propagate or +-modify it is void, and will automatically terminate your rights under +-this License (including any patent licenses granted under the third +-paragraph of section 11). +- +- However, if you cease all violation of this License, then your +-license from a particular copyright holder is reinstated (a) +-provisionally, unless and until the copyright holder explicitly and +-finally terminates your license, and (b) permanently, if the copyright +-holder fails to notify you of the violation by some reasonable means +-prior to 60 days after the cessation. 
+- +- Moreover, your license from a particular copyright holder is +-reinstated permanently if the copyright holder notifies you of the +-violation by some reasonable means, this is the first time you have +-received notice of violation of this License (for any work) from that +-copyright holder, and you cure the violation prior to 30 days after +-your receipt of the notice. +- +- Termination of your rights under this section does not terminate the +-licenses of parties who have received copies or rights from you under +-this License. If your rights have been terminated and not permanently +-reinstated, you do not qualify to receive new licenses for the same +-material under section 10. +- +- 9. Acceptance Not Required for Having Copies. +- +- You are not required to accept this License in order to receive or +-run a copy of the Program. Ancillary propagation of a covered work +-occurring solely as a consequence of using peer-to-peer transmission +-to receive a copy likewise does not require acceptance. However, +-nothing other than this License grants you permission to propagate or +-modify any covered work. These actions infringe copyright if you do +-not accept this License. Therefore, by modifying or propagating a +-covered work, you indicate your acceptance of this License to do so. +- +- 10. Automatic Licensing of Downstream Recipients. +- +- Each time you convey a covered work, the recipient automatically +-receives a license from the original licensors, to run, modify and +-propagate that work, subject to this License. You are not responsible +-for enforcing compliance by third parties with this License. +- +- An "entity transaction" is a transaction transferring control of an +-organization, or substantially all assets of one, or subdividing an +-organization, or merging organizations. If propagation of a covered +-work results from an entity transaction, each party to that +-transaction who receives a copy of the work also receives whatever +-licenses to the work the party's predecessor in interest had or could +-give under the previous paragraph, plus a right to possession of the +-Corresponding Source of the work from the predecessor in interest, if +-the predecessor has it or can get it with reasonable efforts. +- +- You may not impose any further restrictions on the exercise of the +-rights granted or affirmed under this License. For example, you may +-not impose a license fee, royalty, or other charge for exercise of +-rights granted under this License, and you may not initiate litigation +-(including a cross-claim or counterclaim in a lawsuit) alleging that +-any patent claim is infringed by making, using, selling, offering for +-sale, or importing the Program or any portion of it. +- +- 11. Patents. +- +- A "contributor" is a copyright holder who authorizes use under this +-License of the Program or a work on which the Program is based. The +-work thus licensed is called the contributor's "contributor version". +- +- A contributor's "essential patent claims" are all patent claims +-owned or controlled by the contributor, whether already acquired or +-hereafter acquired, that would be infringed by some manner, permitted +-by this License, of making, using, or selling its contributor version, +-but do not include claims that would be infringed only as a +-consequence of further modification of the contributor version. For +-purposes of this definition, "control" includes the right to grant +-patent sublicenses in a manner consistent with the requirements of +-this License. 
+- +- Each contributor grants you a non-exclusive, worldwide, royalty-free +-patent license under the contributor's essential patent claims, to +-make, use, sell, offer for sale, import and otherwise run, modify and +-propagate the contents of its contributor version. +- +- In the following three paragraphs, a "patent license" is any express +-agreement or commitment, however denominated, not to enforce a patent +-(such as an express permission to practice a patent or covenant not to +-sue for patent infringement). To "grant" such a patent license to a +-party means to make such an agreement or commitment not to enforce a +-patent against the party. +- +- If you convey a covered work, knowingly relying on a patent license, +-and the Corresponding Source of the work is not available for anyone +-to copy, free of charge and under the terms of this License, through a +-publicly available network server or other readily accessible means, +-then you must either (1) cause the Corresponding Source to be so +-available, or (2) arrange to deprive yourself of the benefit of the +-patent license for this particular work, or (3) arrange, in a manner +-consistent with the requirements of this License, to extend the patent +-license to downstream recipients. "Knowingly relying" means you have +-actual knowledge that, but for the patent license, your conveying the +-covered work in a country, or your recipient's use of the covered work +-in a country, would infringe one or more identifiable patents in that +-country that you have reason to believe are valid. +- +- If, pursuant to or in connection with a single transaction or +-arrangement, you convey, or propagate by procuring conveyance of, a +-covered work, and grant a patent license to some of the parties +-receiving the covered work authorizing them to use, propagate, modify +-or convey a specific copy of the covered work, then the patent license +-you grant is automatically extended to all recipients of the covered +-work and works based on it. +- +- A patent license is "discriminatory" if it does not include within +-the scope of its coverage, prohibits the exercise of, or is +-conditioned on the non-exercise of one or more of the rights that are +-specifically granted under this License. You may not convey a covered +-work if you are a party to an arrangement with a third party that is +-in the business of distributing software, under which you make payment +-to the third party based on the extent of your activity of conveying +-the work, and under which the third party grants, to any of the +-parties who would receive the covered work from you, a discriminatory +-patent license (a) in connection with copies of the covered work +-conveyed by you (or copies made from those copies), or (b) primarily +-for and in connection with specific products or compilations that +-contain the covered work, unless you entered into that arrangement, +-or that patent license was granted, prior to 28 March 2007. +- +- Nothing in this License shall be construed as excluding or limiting +-any implied license or other defenses to infringement that may +-otherwise be available to you under applicable patent law. +- +- 12. No Surrender of Others' Freedom. +- +- If conditions are imposed on you (whether by court order, agreement or +-otherwise) that contradict the conditions of this License, they do not +-excuse you from the conditions of this License. 
If you cannot convey a +-covered work so as to satisfy simultaneously your obligations under this +-License and any other pertinent obligations, then as a consequence you may +-not convey it at all. For example, if you agree to terms that obligate you +-to collect a royalty for further conveying from those to whom you convey +-the Program, the only way you could satisfy both those terms and this +-License would be to refrain entirely from conveying the Program. +- +- 13. Use with the GNU Affero General Public License. +- +- Notwithstanding any other provision of this License, you have +-permission to link or combine any covered work with a work licensed +-under version 3 of the GNU Affero General Public License into a single +-combined work, and to convey the resulting work. The terms of this +-License will continue to apply to the part which is the covered work, +-but the special requirements of the GNU Affero General Public License, +-section 13, concerning interaction through a network will apply to the +-combination as such. +- +- 14. Revised Versions of this License. +- +- The Free Software Foundation may publish revised and/or new versions of +-the GNU General Public License from time to time. Such new versions will +-be similar in spirit to the present version, but may differ in detail to +-address new problems or concerns. +- +- Each version is given a distinguishing version number. If the +-Program specifies that a certain numbered version of the GNU General +-Public License "or any later version" applies to it, you have the +-option of following the terms and conditions either of that numbered +-version or of any later version published by the Free Software +-Foundation. If the Program does not specify a version number of the +-GNU General Public License, you may choose any version ever published +-by the Free Software Foundation. +- +- If the Program specifies that a proxy can decide which future +-versions of the GNU General Public License can be used, that proxy's +-public statement of acceptance of a version permanently authorizes you +-to choose that version for the Program. +- +- Later license versions may give you additional or different +-permissions. However, no additional obligations are imposed on any +-author or copyright holder as a result of your choosing to follow a +-later version. +- +- 15. Disclaimer of Warranty. +- +- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +-APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +-IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +-ALL NECESSARY SERVICING, REPAIR OR CORRECTION. +- +- 16. Limitation of Liability. 
+- +- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +-SUCH DAMAGES. +- +- 17. Interpretation of Sections 15 and 16. +- +- If the disclaimer of warranty and limitation of liability provided +-above cannot be given local legal effect according to their terms, +-reviewing courts shall apply local law that most closely approximates +-an absolute waiver of all civil liability in connection with the +-Program, unless a warranty or assumption of liability accompanies a +-copy of the Program in return for a fee. +- +- END OF TERMS AND CONDITIONS +- +- How to Apply These Terms to Your New Programs +- +- If you develop a new program, and you want it to be of the greatest +-possible use to the public, the best way to achieve this is to make it +-free software which everyone can redistribute and change under these terms. +- +- To do so, attach the following notices to the program. It is safest +-to attach them to the start of each source file to most effectively +-state the exclusion of warranty; and each file should have at least +-the "copyright" line and a pointer to where the full notice is found. +- +- +- Copyright (C) +- +- This program is free software: you can redistribute it and/or modify +- it under the terms of the GNU General Public License as published by +- the Free Software Foundation, either version 3 of the License, or +- (at your option) any later version. +- +- This program is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- GNU General Public License for more details. +- +- You should have received a copy of the GNU General Public License +- along with this program. If not, see . +- +-Also add information on how to contact you by electronic and paper mail. +- +- If the program does terminal interaction, make it output a short +-notice like this when it starts in an interactive mode: +- +- Copyright (C) +- This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. +- This is free software, and you are welcome to redistribute it +- under certain conditions; type `show c' for details. +- +-The hypothetical commands `show w' and `show c' should show the appropriate +-parts of the General Public License. Of course, your program's commands +-might be different; for a GUI interface, you would use an "about box". +- +- You should also get your employer (if you work as a programmer) or school, +-if any, to sign a "copyright disclaimer" for the program, if necessary. +-For more information on this, and how to apply and follow the GNU GPL, see +-. +- +- The GNU General Public License does not permit incorporating your program +-into proprietary programs. If your program is a subroutine library, you +-may consider it more useful to permit linking proprietary applications with +-the library. If this is what you want to do, use the GNU Lesser General +-Public License instead of this License. 
But first, please read +-. +diff --git a/lib/PBA/ProgramCU.cu b/lib/PBA/ProgramCU.cu +deleted file mode 100644 +index 890c20f..0000000 +--- a/lib/PBA/ProgramCU.cu ++++ /dev/null +@@ -1,3637 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ProgramCU.cu +-// Author: Changchang Wu +-// Description : implementation of ProgramCU and all CUDA kernels +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include "CuTexImage.h" +-#include "ProgramCU.h" +- +-#define IMUL(X, Y) __mul24(X, Y) +-#define FDIV(X, Y) __fdividef(X, Y) +-#define FDIV2(X, Y) ((X) / (Y)) +-#define MAX_BLOCKLEN 65535 +-#define MAX_BLOCKLEN_ALIGN 65504 +-#define MAX_TEXSIZE (1 << 29) +-#define TEX_TOOBIG4(sz) (sz >> 31) +-#define REDUCTION_NBLOCK 32 +- +-namespace pba { +- +-inline void CuTexImage::BindTexture(textureReference& texRef) { +- size_t sz = GetDataSize(); +- if (sz > MAX_TEXSIZE) +- fprintf(stderr, "cudaBindTexture: %lX > %d\n", sz, MAX_TEXSIZE); +- cudaError_t e = +- cudaBindTexture(NULL, &texRef, data(), &texRef.channelDesc, sz); +-} +- +-inline void CuTexImage::BindTexture(textureReference& texRef, int offset, +- size_t size) { +- cudaError_t e = cudaBindTexture(NULL, &texRef, (char*)_cuData + offset, +- &texRef.channelDesc, size); +- if (e) fprintf(stderr, "cudaBindTexture: none-zero offset\n"); +-} +- +-inline void CuTexImage::BindTexture2(textureReference& texRef1, +- textureReference& texRef2) { +- size_t sz = GetDataSize(); +- if (sz <= MAX_TEXSIZE) { +- BindTexture(texRef1); +- } else { +- BindTexture(texRef1, 0, MAX_TEXSIZE); +- BindTexture(texRef2, MAX_TEXSIZE, sz - MAX_TEXSIZE); +- } +-} +- +-inline void CuTexImage::BindTexture4(textureReference& texRef1, +- textureReference& texRef2, +- textureReference& texRef3, +- textureReference& texRef4) { +- size_t sz = GetDataSize(); +- if (sz <= MAX_TEXSIZE) { +- BindTexture(texRef1); +- } else { +- BindTexture(texRef1, 0, MAX_TEXSIZE); +- if (sz <= 2 * MAX_TEXSIZE) { +- BindTexture(texRef2, MAX_TEXSIZE, sz - MAX_TEXSIZE); +- } else { +- BindTexture(texRef2, MAX_TEXSIZE, MAX_TEXSIZE); +- if (sz <= 3 * MAX_TEXSIZE) { +- BindTexture(texRef3, MAX_TEXSIZE * 2, sz - MAX_TEXSIZE * 2); +- } else { +- BindTexture(texRef3, MAX_TEXSIZE * 2, MAX_TEXSIZE); +- BindTexture(texRef4, MAX_TEXSIZE * 3, sz - MAX_TEXSIZE * 3); +- } +- } +- } +-} +- +-inline int CuTexImage::BindTextureX(textureReference& texRef1, +- textureReference& texRef2, +- textureReference& texRef3, +- textureReference& texRef4, bool force4) { +- size_t szjc = GetDataSize(); +- if (TEX_TOOBIG4(szjc)) { +- return 0; +- } else if (force4) { +- BindTexture4(texRef1, texRef2, texRef3, texRef4); +- return 4; +- } else if (szjc > 2 * MAX_TEXSIZE) { +- return 0; +- } else if (szjc > MAX_TEXSIZE) { +- BindTexture2(texRef1, texRef2); +- return 2; +- } else { +- 
BindTexture(texRef1); +- return 1; +- } +-} +- +-void ProgramCU::FinishWorkCUDA() { cudaThreadSynchronize(); } +- +-int ProgramCU::CheckErrorCUDA(const char* location) { +- cudaError_t e = cudaGetLastError(); +- if (e) { +- if (location) fprintf(stderr, "%s:\t", location); +- fprintf(stderr, "%s(%d)\n", cudaGetErrorString(e), e); +- throw location; +- } else { +- // fprintf(stderr, "%s:\n", location); +- return 0; +- } +-} +- +-inline void ProgramCU::GetBlockConfiguration(unsigned int nblock, +- unsigned int& bw, +- unsigned int& bh) { +- if (nblock <= MAX_BLOCKLEN) { +- bw = nblock; +- bh = 1; +- } else { +- bh = (nblock + MAX_BLOCKLEN_ALIGN - 1) / MAX_BLOCKLEN_ALIGN; +- bw = (nblock + bh - 1) / bh; +- bw = ((bw + 31) / 32) * 32; +- bh = (nblock + bw - 1) / bw; +- } +-} +- +-void ProgramCU::ClearPreviousError() { cudaGetLastError(); } +- +-void ProgramCU::ResetCurrentDevice() { +- int device = 0; +- cudaGetDevice(&device); +- cudaDeviceReset(); +- if (device > 0) cudaSetDevice(device); +-} +- +-size_t ProgramCU::GetCudaMemoryCap() { +- int device; +- if (cudaGetDevice(&device) != cudaSuccess) return 0; +- cudaDeviceProp prop; +- if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) { +- if (prop.major == 9999 && prop.minor == 9999) return 0; +- return prop.totalGlobalMem; +- } else +- return 0; +-} +-int ProgramCU::SetCudaDevice(int device) { +- int count = 0, device_used; +- if (cudaGetDeviceCount(&count) || count <= 0) { +- ProgramCU::CheckErrorCUDA("CheckCudaDevice"); +- return 0; +- } else if (count == 1) { +- cudaDeviceProp deviceProp; +- if (cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess) { +- fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n"); +- return 0; +- } +- if (deviceProp.major == 9999 && deviceProp.minor == 9999) { +- fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n"); +- return 0; +- } +- } +- +- if (device > 0 && device < count) { +- cudaSetDevice(device); +- CheckErrorCUDA("cudaSetDevice\n"); +- } +- cudaGetDevice(&device_used); +- if (device != device_used) +- fprintf(stderr, +- "ERROR: Cannot set device to %d\n" +- "WARNING: Use device-%d instead (out of %d)\n", +- device, device_used, count); +- return 1; +-} +- +-#define WARP_REDUCTION_32(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 16) value[threadIdx.x] += value[threadIdx.x + 16]; \ +- if (threadIdx.x < 8) value[threadIdx.x] += value[threadIdx.x + 8]; \ +- if (threadIdx.x < 4) value[threadIdx.x] += value[threadIdx.x + 4]; \ +- if (threadIdx.x < 2) value[threadIdx.x] += value[threadIdx.x + 2]; +- +-#define WARP_REDUCTION_64(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 32) value[threadIdx.x] += value[threadIdx.x + 32]; \ +- WARP_REDUCTION_32(value) +- +-#define WARP_REDUCTION_128(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 64) value[threadIdx.x] += value[threadIdx.x + 64]; \ +- WARP_REDUCTION_64(value) +- +-#define WARP_REDUCTION_256(value) \ +- __syncthreads(); \ +- if (threadIdx.x < 128) value[threadIdx.x] += value[threadIdx.x + 128]; \ +- WARP_REDUCTION_128(value) +- +-__global__ void vector_max_kernel(const float* x, int len, int blen, +- float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v = max(v, fabs(x[i])); +- value[threadIdx.x] = v; +- // reduce to the first two values +- __syncthreads(); +- if (threadIdx.x < 128) +- value[threadIdx.x] = max(value[threadIdx.x], 
value[threadIdx.x + 128]); +- __syncthreads(); +- if (threadIdx.x < 64) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 64]); +- __syncthreads(); +- if (threadIdx.x < 32) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 32]); +- if (threadIdx.x < 16) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 16]); +- if (threadIdx.x < 8) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 8]); +- if (threadIdx.x < 4) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 4]); +- if (threadIdx.x < 2) +- value[threadIdx.x] = max(value[threadIdx.x], value[threadIdx.x + 2]); +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = max(value[0], value[1]); +-} +- +-float ProgramCU::ComputeVectorMax(CuTexImage& vector, CuTexImage& buf) { +- const unsigned int nblock = 32; +- const unsigned int bsize = 256; +- int len = vector.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- vector_max_kernel<<>>(vector.data(), len, blen, buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorMax"); +- +- float data[nblock], result = 0; +- buf.CopyToHost(data); +- for (unsigned int i = 0; i < nblock; ++i) result = max(result, data[i]); +- return result; +-} +- +-__global__ void vector_norm_kernel(const float* x, int len, int blen, +- float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) { +- float temp = x[i]; +- v += (temp * temp); +- } +- value[threadIdx.x] = v; +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-double ProgramCU::ComputeVectorNorm(CuTexImage& vector, CuTexImage& buf) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- vector_norm_kernel<<>>(vector.data(), len, blen, buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorNorm"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return result; +-} +- +-__global__ void vector_sum_kernel(const float* x, int len, int blen, +- float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v += x[i]; +- +- value[threadIdx.x] = v; +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-float ProgramCU::ComputeVectorSum(CuTexImage& vector, CuTexImage& buf, +- int skip) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector.GetLength() - skip; +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- 
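For reference while reading the reduction kernels removed here (vector_max_kernel, vector_norm_kernel, vector_sum_kernel, vector_dotproduct_kernel): each block walks its own slice of the input, folds the partial result with the WARP_REDUCTION_256 shared-memory tree, writes one float per block, and the host adds up the handful of per-block values. Below is a minimal, self-contained sketch of the same two-stage idea using an ordinary grid-stride loop; the names partial_sum_kernel and sum_on_gpu are hypothetical and not part of PBA.

#include <cuda_runtime.h>

// Stage 1: each block accumulates a grid-stride partial sum and reduces it in shared memory.
__global__ void partial_sum_kernel(const float* x, int len, float* partial) {
  __shared__ float value[256];
  float v = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += gridDim.x * blockDim.x)
    v += x[i];
  value[threadIdx.x] = v;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // plays the role of WARP_REDUCTION_256
    if (threadIdx.x < s) value[threadIdx.x] += value[threadIdx.x + s];
    __syncthreads();
  }
  if (threadIdx.x == 0) partial[blockIdx.x] = value[0];
}

// Stage 2: copy the few per-block partials back and finish the sum on the host.
double sum_on_gpu(const float* d_x, int len) {
  const int nblock = 32, bsize = 256;
  float* d_partial = nullptr;
  cudaMalloc(&d_partial, nblock * sizeof(float));
  partial_sum_kernel<<<nblock, bsize>>>(d_x, len, d_partial);
  float h_partial[nblock];
  cudaMemcpy(h_partial, d_partial, sizeof(h_partial), cudaMemcpyDeviceToHost);
  cudaFree(d_partial);
  double result = 0.0;
  for (int i = 0; i < nblock; ++i) result += h_partial[i];
  return result;
}

The deleted code fixes the block count at REDUCTION_NBLOCK (32) and finishes the last 32 additions on the CPU after buf.CopyToHost, which keeps the device side to a single kernel launch; its WARP_REDUCTION macros also rely on implicit warp-synchronous behaviour below 32 threads, whereas the explicit __syncthreads() in the sketch is the portable form.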
buf.InitTexture(nblock, 1); +- vector_sum_kernel<<>>((vector.data()) + skip, len, blen, +- buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorSum"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return (float)result; +-} +- +-__global__ void vector_dotproduct_kernel(const float* a, const float* b, +- int len, int blen, float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v += (a[i] * b[i]); +- value[threadIdx.x] = v; +- +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-double ProgramCU::ComputeVectorDot(CuTexImage& vector1, CuTexImage& vector2, +- CuTexImage& buf) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector1.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- vector_dotproduct_kernel<<>>(vector1.data(), vector2.data(), len, +- blen, buf.data()); +- ProgramCU::CheckErrorCUDA("ComputeVectorDot"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return result; +-} +- +-__global__ void vector_weighted_norm_kernel(const float* vec, const float* w, +- int len, int blen, float* result) { +- __shared__ float value[256]; +- int bstart = blen * blockIdx.x; +- int start = bstart + threadIdx.x; +- int end = min(len, bstart + blen); +- +- float v = 0; +- for (int i = start; i < end; i += blockDim.x) v += (vec[i] * w[i] * vec[i]); +- value[threadIdx.x] = v; +- +- // reduce to the first two values +- WARP_REDUCTION_256(value); +- +- // write back +- if (threadIdx.x == 0) result[blockIdx.x] = (value[0] + value[1]); +-} +- +-double ProgramCU::ComputeVectorNormW(CuTexImage& vector, CuTexImage& weight, +- CuTexImage& buf) { +- if (weight.IsValid()) { +- const unsigned int nblock = REDUCTION_NBLOCK; +- unsigned int bsize = 256; +- int len = vector.GetLength(); +- int blen = ((len + nblock - 1) / nblock + bsize - 1) / bsize * bsize; +- +- //////////////////////////////// +- dim3 grid(nblock), block(bsize); +- +- ///////////////////////////////// +- buf.InitTexture(nblock, 1); +- +- vector_weighted_norm_kernel<<>>(vector.data(), weight.data(), +- len, blen, buf.data()); +- +- ProgramCU::CheckErrorCUDA("ComputeVectorNormW"); +- +- float data[nblock]; +- buf.CopyToHost(data); +- +- double result = 0; +- for (unsigned int i = 0; i < nblock; ++i) result += data[i]; +- return result; +- } else { +- return ComputeVectorNorm(vector, buf); +- } +-} +-// given vector x, y, and a weight a +-// return a * x + y +-__global__ void saxpy_kernel(const float a, const float* x, const float* y, +- float* result, unsigned int len) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; +- if (idx < len) result[idx] = a * x[idx] + y[idx]; +-} +- +-__global__ void saxpy_kernel_large(const float a, const float* x, +- const float* y, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) result[idx] = a * x[idx] + 
y[idx]; +-} +- +-void ProgramCU::ComputeSAXPY(float a, CuTexImage& texX, CuTexImage& texY, +- CuTexImage& result) { +- unsigned int len = result.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- if (nblock > MAX_BLOCKLEN) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- saxpy_kernel_large<<>>(a, texX.data(), texY.data(), +- result.data(), len, bw * bsize); +- } else { +- dim3 grid(nblock), block(bsize); +- saxpy_kernel<<>>(a, texX.data(), texY.data(), result.data(), +- len); +- } +- ProgramCU::CheckErrorCUDA("ComputeSAXPY"); +-} +- +-__global__ void sxypz_kernel_large(float a, const float* x, const float* y, +- const float* z, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) result[idx] = a * x[idx] * y[idx] + z[idx]; +-} +- +-void ProgramCU::ComputeSXYPZ(float a, CuTexImage& texX, CuTexImage& texY, +- CuTexImage& texZ, CuTexImage& result) { +- if (texX.IsValid()) { +- unsigned int len = texX.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- sxypz_kernel_large<<>>(a, texX.data(), texY.data(), +- texZ.data(), result.data(), len, +- bw * bsize); +- } else { +- ComputeSAXPY(a, texY, texZ, result); +- } +-} +- +-__global__ void vxy_kernel(const float* x, float* y, float* result, +- unsigned int len) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; +- if (idx < len) result[idx] = x[idx] * y[idx]; +-} +- +-__global__ void vxy_kernel_large(const float* x, float* y, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + rowsz * blockIdx.y; +- if (idx < len) result[idx] = x[idx] * y[idx]; +-} +- +-void ProgramCU::ComputeVXY(CuTexImage& texX, CuTexImage& texY, +- CuTexImage& result, unsigned int part, +- unsigned int skip) { +- unsigned int len = part ? part : texX.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- if (nblock > MAX_BLOCKLEN) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- vxy_kernel_large<<>>(texX.data() + skip, texY.data() + skip, +- result.data() + skip, len, bsize * bw); +- } else { +- dim3 grid(nblock), block(bsize); +- vxy_kernel<<>>(texX.data() + skip, texY.data() + skip, +- result.data() + skip, len); +- } +- ProgramCU::CheckErrorCUDA("ComputeVXY"); +-} +- +-__global__ void sqrt_kernel_large(float* x, unsigned int len, +- unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) x[idx] = sqrt(x[idx]); +-} +- +-void ProgramCU::ComputeSQRT(CuTexImage& tex) { +- unsigned int len = tex.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- sqrt_kernel_large<<>>(tex.data(), len, bw * bsize); +- ProgramCU::CheckErrorCUDA("ComputeSQRT"); +-} +- +-__global__ void rsqrt_kernel_large(float* x, unsigned int len, +- unsigned int rowsz) { +- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (idx < len) x[idx] = x[idx] > 0 ? 
rsqrt(x[idx]) : 0; +-} +- +-void ProgramCU::ComputeRSQRT(CuTexImage& tex) { +- unsigned int len = tex.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- rsqrt_kernel_large<<>>(tex.data(), len, bw * bsize); +- +- ProgramCU::CheckErrorCUDA("ComputeRSQRT"); +-} +- +-__global__ void sax_kernel(const float a, const float* x, float* result, +- unsigned int len) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; +- if (idx < len) result[idx] = a * x[idx]; +-} +- +-__global__ void sax_kernel_large(const float a, const float* x, float* result, +- unsigned int len, unsigned int rowsz) { +- unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (idx < len) result[idx] = a * x[idx]; +-} +- +-void ProgramCU::ComputeSAX(float a, CuTexImage& texX, CuTexImage& result) { +- unsigned int len = texX.GetLength(); +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- +- if (nblock > MAX_BLOCKLEN) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- sax_kernel_large<<>>(a, texX.data(), result.data(), len, +- bw * bsize); +- } else { +- dim3 grid(nblock), block(bsize); +- sax_kernel<<>>(a, texX.data(), result.data(), len); +- } +- ProgramCU::CheckErrorCUDA("ComputeSAX"); +-} +- +-#define JACOBIAN_FRT_KWIDTH 64 +- +-texture tex_jacobian_cam; +-texture tex_jacobian_pts; +-texture tex_jacobian_idx; +-texture tex_jacobian_meas; +-texture tex_jacobian_sj; +-texture tex_jacobian_shuffle; +- +-#ifndef PBA_DISABLE_CONST_CAMERA +-#define JACOBIAN_SET_JC_BEGIN if (r3.w == 0.0f) { +-#define JFRT_SET_JC_END \ +- } \ +- else { \ +- jc[jc_pos] = make_float4(0, 0, 0, 0); \ +- jc[jc_pos + 1] = make_float4(0, 0, 0, 0); \ +- jc[jc_pos + 2] = make_float4(0, 0, 0, 0); \ +- jc[jc_pos + 3] = make_float4(0, 0, 0, 0); \ +- } +-#define JACOBIAN_SET_JC_END \ +- } \ +- else { \ +- jxc[0] = 0; \ +- jxc[1] = 0; \ +- jxc[2] = 0; \ +- jxc[3] = 0; \ +- jxc[4] = 0; \ +- jxc[5] = 0; \ +- jxc[6] = 0; \ +- jxc[7] = 0; \ +- jyc[0] = 0; \ +- jyc[1] = 0; \ +- jyc[2] = 0; \ +- jyc[3] = 0; \ +- jyc[4] = 0; \ +- jyc[5] = 0; \ +- jyc[6] = 0; \ +- jyc[7] = 0; \ +- } +-#else +-#define JACOBIAN_SET_JC_BEGIN +-#define JFRT_SET_JC_END +-#define JACOBIAN_SET_JC_END +-#endif +- +-// projection model ei = K(RX + T) - (1 + r * m^2) * m +-template +-__global__ void jacobian_frt_kernel(float4* jc, float4* jp, int nproj, int ptx, +- int rowsz, float jic) { +- //////////////////////////////// +- int tidx = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- +- if (tidx >= nproj) return; +- int2 proj = tex1Dfetch(tex_jacobian_idx, tidx); +- int camera_pos = proj.x << 1; +- +- __shared__ float rr_data[JACOBIAN_FRT_KWIDTH * 9]; +- float* r = rr_data + IMUL(9, threadIdx.x); +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, proj.y); +- float m[3]; +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- 
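As a reading aid for the Jacobian and projection kernels in this deleted file: in the undistorted branch, the quantity being differentiated is the measurement minus f * (R m + t) after perspective division, which is what the x0/y0/z0, f_p2, p0_p2 and p1_p2 terms computed around this point express in packed float4 form. A standalone device-side sketch of that residual with a plain struct in place of PBA's camera layout follows; SimpleCamera and project_residual are illustrative names only.

#include <cuda_runtime.h>

// Illustrative camera: focal length f, row-major rotation r[9], translation t[3].
struct SimpleCamera { float f; float r[9]; float t[3]; };

// Reprojection residual of 3D point m against measurement (mx, my), no radial distortion.
__device__ float2 project_residual(const SimpleCamera& cam, const float3& m,
                                   float mx, float my) {
  float p0 = cam.r[0] * m.x + cam.r[1] * m.y + cam.r[2] * m.z + cam.t[0];
  float p1 = cam.r[3] * m.x + cam.r[4] * m.y + cam.r[5] * m.z + cam.t[1];
  float p2 = cam.r[6] * m.x + cam.r[7] * m.y + cam.r[8] * m.z + cam.t[2];
  float f_p2 = __fdividef(cam.f, p2);  // the fast divide the original wraps as FDIV
  return make_float2(mx - p0 * f_p2, my - p1 * f_p2);
}

The two distortion branches add either a projection-space factor, 1 + r * (p0*p0 + p1*p1) / (p2*p2), or a measurement-space factor, 1 + r * (mx*mx + my*my), on top of this.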
float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +- +- // dp/dx = [f/p2 0 -f*p0/p2/p2] +- // [0 f/p2 -f*p1/p2/p2] +- // dx/dw = [ 0 z -y] +- // [-z 0 x] +- // [ y -x 0] +- // R(dw) (x y z)' = (0 -z y)' dw0 + (z 0 -x)'dw1 + (-y x 0)'dw2 +- int jc_pos; +- if (shuffle) { +- jc_pos = tex1Dfetch(tex_jacobian_shuffle, tidx) << 2; +- } else { +- jc_pos = tidx << 2; +- } +- +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- if (scaling == false) { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- // float jic = (r3.w != 1.0f && r3.w != 2.0f) ? 1.0f : 0.0f; +- // float jec = (r3.w != 1.0f && r3.w != 3.0f) ? 1.0f : 0.0f; +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- jc[jc_pos] = make_float4(p0_p2 * jfc, f_p2_x, 0, -f_p2_x * p0_p2); +- jc[jc_pos + 1] = +- make_float4(-f_p2_x * p0_p2 * y0, f_p2_x * (z0 + x0 * p0_p2), +- -f_p2_x * y0, ft_x_pn * p0_p2); +- jc[jc_pos + 2] = make_float4(p1_p2 * jfc, 0, f_p2_y, -f_p2 * p1_p2); +- jc[jc_pos + 3] = +- make_float4(-f_p2_y * (z0 + y0 * p1_p2), f_p2_y * x0 * p1_p2, +- f_p2_y * x0, ft_x_pn * p1_p2); +- JFRT_SET_JC_END +- } +- //////////////////// +- jp[(tidx << 1)] = make_float4(f_p2_x * (r[0] - r[6] * p0_p2), +- f_p2_x * (r[1] - r[7] * p0_p2), +- f_p2_x * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = make_float4(f_p2_y * (r[3] - r[6] * p1_p2), +- f_p2_y * (r[4] - r[7] * p1_p2), +- f_p2_y * (r[5] - r[8] * p1_p2), 0); +- } else { +- //////////////////// +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x); +- jc[jc_pos] = make_float4(p0_p2 * jfc * sc1.x, f_p2_x * sc1.y, 0, +- -f_p2_x * p0_p2 * sc1.w); +- jc[jc_pos + 2] = make_float4(p1_p2 * jfc * sc1.x, 0, f_p2_y * sc1.z, +- -f_p2_y * p1_p2 * sc1.w); +- +- float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1); +- jc[jc_pos + 1] = make_float4( +- -sc2.x * f_p2_x * p0_p2 * y0, sc2.y * f_p2_x * (z0 + x0 * p0_p2), +- -sc2.z * f_p2_x * y0, ft_x_pn * p0_p2 * sc2.w); +- jc[jc_pos + 3] = make_float4( +- -sc2.x * f_p2_y * (z0 + y0 * p1_p2), sc2.y * f_p2_y * x0 * p1_p2, +- sc2.z * f_p2_y * x0, ft_x_pn * p1_p2 * sc2.w); +- JFRT_SET_JC_END +- } +- +- float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx); +- jp[(tidx << 1)] = make_float4(sc3.x * f_p2_x * (r[0] - r[6] * p0_p2), +- sc3.y * f_p2_x * (r[1] - r[7] * p0_p2), +- sc3.z * f_p2_x * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = +- make_float4(sc3.x * f_p2_y * (r[3] - r[6] * p1_p2), +- sc3.y * f_p2_y * (r[4] - r[7] * p1_p2), +- sc3.z * f_p2_y * (r[5] - r[8] * p1_p2), 0); +- } +- } else if (md) { +- if (scaling == false) { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2); +- jc[jc_pos + 1] = +- make_float4(-f_p2 * p0_p2 * y0, f_p2 * (z0 + x0 * p0_p2), +- -f_p2 * y0, -ms.x * msn); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2); +- jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2), +- f_p2 * x0 * p1_p2, f_p2 * x0, -ms.y * msn); +- JFRT_SET_JC_END +- } +- //////////////////// +- jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * 
p0_p2), +- f_p2 * (r[1] - r[7] * p0_p2), +- f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2), +- f_p2 * (r[4] - r[7] * p1_p2), +- f_p2 * (r[5] - r[8] * p1_p2), 0); +- } else { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x); +- jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0, +- -f_p2 * p0_p2 * sc1.w); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z, +- -f_p2 * p1_p2 * sc1.w); +- +- float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1); +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0, +- sc2.y * f_p2 * (z0 + x0 * p0_p2), +- -sc2.z * f_p2 * y0, -msn * ms.x * sc2.w); +- jc[jc_pos + 3] = make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2), +- sc2.y * f_p2 * x0 * p1_p2, +- sc2.z * f_p2 * x0, -msn * ms.y * sc2.w); +- JFRT_SET_JC_END +- } +- float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx); +- jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2), +- sc3.y * f_p2 * (r[1] - r[7] * p0_p2), +- sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = +- make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2), +- sc3.y * f_p2 * (r[4] - r[7] * p1_p2), +- sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0); +- } +- +- } else { +- if (scaling == false) { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- jc[jc_pos] = make_float4(p0_p2 * jic, f_p2, 0, -f_p2 * p0_p2); +- jc[jc_pos + 1] = make_float4(-f_p2 * p0_p2 * y0, +- f_p2 * (z0 + x0 * p0_p2), -f_p2 * y0, 0); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic, 0, f_p2, -f_p2 * p1_p2); +- jc[jc_pos + 3] = make_float4(-f_p2 * (z0 + y0 * p1_p2), +- f_p2 * x0 * p1_p2, f_p2 * x0, 0); +- JFRT_SET_JC_END +- } +- //////////////////// +- jp[(tidx << 1)] = make_float4(f_p2 * (r[0] - r[6] * p0_p2), +- f_p2 * (r[1] - r[7] * p0_p2), +- f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = make_float4(f_p2 * (r[3] - r[6] * p1_p2), +- f_p2 * (r[4] - r[7] * p1_p2), +- f_p2 * (r[5] - r[8] * p1_p2), 0); +- } else { +- if (jc) { +- JACOBIAN_SET_JC_BEGIN +- float4 sc1 = tex1Dfetch(tex_jacobian_sj, proj.x); +- jc[jc_pos] = make_float4(p0_p2 * jic * sc1.x, f_p2 * sc1.y, 0, +- -f_p2 * p0_p2 * sc1.w); +- jc[jc_pos + 2] = make_float4(p1_p2 * jic * sc1.x, 0, f_p2 * sc1.z, +- -f_p2 * p1_p2 * sc1.w); +- float4 sc2 = tex1Dfetch(tex_jacobian_sj, proj.x + 1); +- jc[jc_pos + 1] = make_float4(-sc2.x * f_p2 * p0_p2 * y0, +- sc2.y * f_p2 * (z0 + x0 * p0_p2), +- -sc2.z * f_p2 * y0, 0); +- jc[jc_pos + 3] = +- make_float4(-sc2.x * f_p2 * (z0 + y0 * p1_p2), +- sc2.y * f_p2 * x0 * p1_p2, sc2.z * f_p2 * x0, 0); +- JFRT_SET_JC_END +- } +- +- float4 sc3 = tex1Dfetch(tex_jacobian_sj, proj.y + ptx); +- jp[(tidx << 1)] = make_float4(sc3.x * f_p2 * (r[0] - r[6] * p0_p2), +- sc3.y * f_p2 * (r[1] - r[7] * p0_p2), +- sc3.z * f_p2 * (r[2] - r[8] * p0_p2), 0); +- jp[(tidx << 1) + 1] = +- make_float4(sc3.x * f_p2 * (r[3] - r[6] * p1_p2), +- sc3.y * f_p2 * (r[4] - r[7] * p1_p2), +- sc3.z * f_p2 * (r[5] - r[8] * p1_p2), 0); +- } +- } +-} +- +-///////////////////////////////// +-void ProgramCU::ComputeJacobian(CuTexImage& camera, CuTexImage& point, +- CuTexImage& jc, CuTexImage& jp, +- CuTexImage& proj_map, CuTexImage& sj, +- CuTexImage& meas, CuTexImage& cmlist, +- bool intrinsic_fixed, int radial_distortion, +- bool shuffle) { +- float jfc = intrinsic_fixed ? 
0.0f : 1.0f; +- unsigned int len = proj_map.GetImgWidth(); +- unsigned int bsize = JACOBIAN_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- proj_map.BindTexture(tex_jacobian_idx); +- +- if (!jc.IsValid()) shuffle = false; +- if (shuffle) cmlist.BindTexture(tex_jacobian_shuffle); +- if (sj.IsValid()) sj.BindTexture(tex_jacobian_sj); +- +- if (radial_distortion == -1) { +- meas.BindTexture(tex_jacobian_meas); +- if (sj.IsValid()) { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } else { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } +- } else if (radial_distortion) { +- if (sj.IsValid()) { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } else { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } +- } else { +- if (sj.IsValid()) { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } else { +- if (shuffle) +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- else +- jacobian_frt_kernel<<>>( +- (float4*)jc.data(), (float4*)jp.data(), len, +- camera.GetImgWidth() * 2, bw * bsize, jfc); +- } +- } +- +- ProgramCU::CheckErrorCUDA("ComputeJacobian"); +-} +- +-texture tex_compact_cam; +-__global__ void uncompress_frt_kernel(int ncam, float4* ucam) { +- int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (tidx >= ncam) return; +- int fetch_index = tidx << 1; +- int write_index = IMUL(tidx, 4); +- float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index); +- ucam[write_index] = temp1; +- +- float4 temp2 = tex1Dfetch(tex_compact_cam, fetch_index + 1); +- float rx = temp2.x; +- float ry = temp2.y; +- float rz = temp2.z; +- float rx_rx = rx * rx; +- float ry_ry = ry * ry; +- float rz_rz = rz * rz; +- float aa = sqrt(rx_rx + ry_ry + rz_rz); +- float caa, saa; +- sincosf(aa, &saa, &caa); +- float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa); +- float st = aa == 0.0 ? 
1 : FDIV2(saa, aa); +- float rz_st = rz * st; +- float rx_st = rx * st; +- float ry_st = ry * st; +- float ry_ry_ct = ry_ry * ct; +- float rx_rx_ct = rx_rx * ct; +- float rz_rz_ct = rz_rz * ct; +- float rx_ry_ct = rx * ry * ct; +- float rz_rx_ct = rz * rx * ct; +- float ry_rz_ct = ry * rz * ct; +- +- //////////////////////////////////////////////////////////// +- ucam[write_index + 1] = +- make_float4((1.0 - (ry_ry_ct + rz_rz_ct)), (rx_ry_ct - rz_st), +- (rz_rx_ct + ry_st), (rx_ry_ct + rz_st)); +- +- ucam[write_index + 2] = +- make_float4((1.0 - (rz_rz_ct + rx_rx_ct)), (ry_rz_ct - rx_st), +- (rz_rx_ct - ry_st), (ry_rz_ct + rx_st)); +- +- ucam[write_index + 3] = +- make_float4((1.0 - (rx_rx_ct + ry_ry_ct)), temp2.w, 0, 0); +-} +- +-void ProgramCU::UncompressCamera(int ncam, CuTexImage& camera, +- CuTexImage& result) { +- unsigned int len = ncam; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- dim3 grid(nblock); +- dim3 block(bsize); +- camera.BindTexture(tex_compact_cam); +- uncompress_frt_kernel<<>>(len, (float4*)result.data()); +- CheckErrorCUDA("UncompressCamera"); +-} +- +-texture tex_uncompressed_cam; +- +-__global__ void compress_frt_kernel(int ncam, float4* zcam) { +- int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (tidx >= ncam) return; +- int fetch_index = tidx << 2; +- int write_index = tidx << 1; +- float4 temp1 = tex1Dfetch(tex_compact_cam, fetch_index); +- zcam[write_index] = temp1; +- +- float4 r1 = tex1Dfetch(tex_compact_cam, fetch_index + 1); +- float4 r2 = tex1Dfetch(tex_compact_cam, fetch_index + 2); +- float4 r3 = tex1Dfetch(tex_compact_cam, fetch_index + 3); +- +- float a = (r1.x + r2.x + r3.x - 1.0) / 2.0; +- if (a >= 1.0) { +- zcam[write_index + 1] = make_float4(0, 0, 0, 0); +- } else { +- float aa = acos(a), b = 0.5 * aa * rsqrt(1 - a * a); +- zcam[write_index + 1] = make_float4(b * (r2.w - r2.y), b * (r1.z - r2.z), +- b * (r1.w - r1.y), r3.y); +- } +-} +- +-void ProgramCU::CompressCamera(int ncam, CuTexImage& camera0, +- CuTexImage& result) { +- unsigned int len = ncam; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- dim3 grid(nblock), block(bsize); +- camera0.BindTexture(tex_uncompressed_cam); +- compress_frt_kernel<<>>(ncam, (float4*)result.data()); +- CheckErrorCUDA("CompressCamera"); +-} +- +-__device__ inline void uncompress_rodrigues_rotation(float rx, float ry, +- float rz, float* r) { +- float rx_rx = rx * rx; +- float ry_ry = ry * ry; +- float rz_rz = rz * rz; +- float aa = sqrt(rx_rx + ry_ry + rz_rz); +- float caa, saa; +- sincosf(aa, &saa, &caa); +- float ct = aa == 0.0 ? 0.5 : FDIV2(1.0 - caa, aa * aa); +- float st = aa == 0.0 ? 
1 : FDIV2(saa, aa); +- float rz_st = rz * st; +- float rx_st = rx * st; +- float ry_st = ry * st; +- float ry_ry_ct = ry_ry * ct; +- float rx_rx_ct = rx_rx * ct; +- float rz_rz_ct = rz_rz * ct; +- float rx_ry_ct = rx * ry * ct; +- float rz_rx_ct = rz * rx * ct; +- float ry_rz_ct = ry * rz * ct; +- r[0] = (1.0 - (ry_ry_ct + rz_rz_ct)); +- r[1] = (rx_ry_ct - rz_st); +- r[2] = (rz_rx_ct + ry_st); +- r[3] = (rx_ry_ct + rz_st); +- r[4] = (1.0 - (rz_rz_ct + rx_rx_ct)); +- r[5] = (ry_rz_ct - rx_st); +- r[6] = (rz_rx_ct - ry_st); +- r[7] = (ry_rz_ct + rx_st); +- r[8] = (1.0 - (rx_rx_ct + ry_ry_ct)); +-} +- +-texture tex_update_cam; +-texture tex_update_cam_delta; +- +-__global__ void update_camera_kernel(int ncam, float4* newcam) { +- int tidx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (tidx >= ncam) return; +- int index0 = tidx << 2; +- int index1 = tidx << 1; +- { +- float4 c1 = tex1Dfetch(tex_update_cam, index0); +- float4 d1 = tex1Dfetch(tex_update_cam_delta, index1); +- float4 c2 = make_float4(max(c1.x + d1.x, 1e-10f), c1.y + d1.y, c1.z + d1.z, +- c1.w + d1.w); +- newcam[index0] = c2; +- } +- { +- float r[9], dr[9]; //, nr[9]; +- float4 r1 = tex1Dfetch(tex_update_cam, index0 + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_update_cam, index0 + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_update_cam, index0 + 3); +- r[8] = r3.x; +- +- float4 dd = tex1Dfetch(tex_update_cam_delta, index1 + 1); +- uncompress_rodrigues_rotation(dd.x, dd.y, dd.z, dr); +- +- /////////////////////////////////////////////// +- newcam[index0 + 1] = +- make_float4(dr[0] * r[0] + dr[1] * r[3] + dr[2] * r[6], +- dr[0] * r[1] + dr[1] * r[4] + dr[2] * r[7], +- dr[0] * r[2] + dr[1] * r[5] + dr[2] * r[8], +- dr[3] * r[0] + dr[4] * r[3] + dr[5] * r[6]); +- newcam[index0 + 2] = +- make_float4(dr[3] * r[1] + dr[4] * r[4] + dr[5] * r[7], +- dr[3] * r[2] + dr[4] * r[5] + dr[5] * r[8], +- dr[6] * r[0] + dr[7] * r[3] + dr[8] * r[6], +- dr[6] * r[1] + dr[7] * r[4] + dr[8] * r[7]); +- newcam[index0 + 3] = make_float4(dr[6] * r[2] + dr[7] * r[5] + dr[8] * r[8], +- r3.y + dd.w, r3.z, r3.w); +- } +-} +- +-void ProgramCU::UpdateCameraPoint(int ncam, CuTexImage& camera, +- CuTexImage& point, CuTexImage& delta, +- CuTexImage& new_camera, CuTexImage& new_point, +- int mode) { +- if (mode != 2) { +- unsigned int len = ncam; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- dim3 grid(nblock), block(bsize); +- camera.BindTexture(tex_update_cam); +- delta.BindTexture(tex_update_cam_delta); +- update_camera_kernel<<>>(len, (float4*)new_camera.data()); +- CheckErrorCUDA("UpdateCamera"); +- } +- +- // update the points +- if (mode != 1) { +- CuTexImage dp; +- dp.SetTexture(delta.data() + 8 * ncam, point.GetLength()); +- ComputeSAXPY(1.0f, dp, point, new_point); +- CheckErrorCUDA("UpdatePoint"); +- } +-} +- +-#define PROJECTION_FRT_KWIDTH 64 +- +-texture tex_projection_cam; +-texture tex_projection_idx; +-texture tex_projection_pts; +-texture tex_projection_mea; +- +-// run 32/64/128 projections in a block +-template +-__global__ void projection_frt_kernel(int nproj, int rowsz, float2* pj) { +- //////////////////////////////// +- int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (tidx >= nproj) return; +- float f, m[3], t[3]; // r[9], +- __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9]; +- float* r = rr_data + IMUL(9, threadIdx.x); +- int2 proj = tex1Dfetch(tex_projection_idx, 
tidx); +- int cpos = proj.x << 1; +- float4 ft = tex1Dfetch(tex_projection_cam, cpos); +- f = ft.x; +- t[0] = ft.y; +- t[1] = ft.z; +- t[2] = ft.w; +- float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_projection_pts, proj.y); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0]; +- float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1]; +- float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2]; +- +- if (pd) { +- float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2); +- float f_p2 = FDIV2(f * rr, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2); +- } else if (md) { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y); +- pj[tidx] = make_float2(ms.x * rd - p0 * f_p2, ms.y * rd - p1 * f_p2); +- } else { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2); +- } +-} +- +-void ProgramCU::ComputeProjection(CuTexImage& camera, CuTexImage& point, +- CuTexImage& meas, CuTexImage& proj_map, +- CuTexImage& proj, int radial) { +- unsigned int len = proj_map.GetImgWidth(); +- unsigned int bsize = PROJECTION_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- camera.BindTexture(tex_projection_cam); +- point.BindTexture(tex_projection_pts); +- proj_map.BindTexture(tex_projection_idx); +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- meas.BindTexture(tex_projection_mea); +- if (radial == -1) +- projection_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else if (radial) +- projection_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else +- projection_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- CheckErrorCUDA("ComputeProjection"); +-} +- +-template +-__global__ void projectionx_frt_kernel(int nproj, int rowsz, float2* pj) { +- //////////////////////////////// +- int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (tidx >= nproj) return; +- float f, m[3], t[3]; // r[9], +- __shared__ float rr_data[PROJECTION_FRT_KWIDTH * 9]; +- float* r = rr_data + IMUL(9, threadIdx.x); +- int2 proj = tex1Dfetch(tex_projection_idx, tidx); +- int cpos = proj.x << 1; +- float4 ft = tex1Dfetch(tex_projection_cam, cpos); +- f = ft.x; +- t[0] = ft.y; +- t[1] = ft.z; +- t[2] = ft.w; +- float4 r1 = tex1Dfetch(tex_projection_cam, cpos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_projection_cam, cpos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_projection_cam, cpos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_projection_pts, proj.y); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float p0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2] + t[0]; +- float p1 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2] + t[1]; +- float p2 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2] + t[2]; +- if (pd) { +- float rr = 1.0 + r3.y * (p0 * p0 + p1 * p1) / (p2 * p2); +- float f_p2 = FDIV2(f, 
p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x / rr - p0 * f_p2, ms.y / rr - p1 * f_p2); +- } else if (md) { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- float rd = 1.0 + r3.y * (ms.x * ms.x + ms.y * ms.y); +- pj[tidx] = make_float2(ms.x - p0 * f_p2 / rd, ms.y - p1 * f_p2 / rd); +- } else { +- float f_p2 = FDIV2(f, p2); +- float2 ms = tex1Dfetch(tex_projection_mea, tidx); +- pj[tidx] = make_float2(ms.x - p0 * f_p2, ms.y - p1 * f_p2); +- } +-} +- +-void ProgramCU::ComputeProjectionX(CuTexImage& camera, CuTexImage& point, +- CuTexImage& meas, CuTexImage& proj_map, +- CuTexImage& proj, int radial) { +- unsigned int len = proj_map.GetImgWidth(); +- unsigned int bsize = PROJECTION_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- camera.BindTexture(tex_projection_cam); +- point.BindTexture(tex_projection_pts); +- proj_map.BindTexture(tex_projection_idx); +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- meas.BindTexture(tex_projection_mea); +- if (radial == -1) +- projectionx_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else if (radial) +- projectionx_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- else +- projectionx_frt_kernel<<>>(len, bw * bsize, +- (float2*)proj.data()); +- CheckErrorCUDA("ComputeProjection"); +-} +- +-texture tex_jte_pe; +-texture tex_jte_pex; +-texture tex_jte_jc; +-texture tex_jte_jc2; +-texture tex_jte_cmp; +-texture tex_jte_cmt; +-texture tex_jte_jc3; +-texture tex_jte_jc4; +- +-__global__ void jte_cam_kernel(int num, float* jc, float* jte) { +- __shared__ float value[128]; +- +- // 8thread per camera +- int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x; +- if (col >= num) return; +- +- int cam = col >> 4; // 8 thread per camera +- +- // read data range for this camera, 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1 +- +- /////////////////////////////// +- int offset = threadIdx.x & 0xf; // which parameter of this camera +- int part = offset >= 8 ? 1 : 0; +- ///////////////////////////// +- +- float result = 0; +- // loop to read the index of the projection. +- // so to get the location to read the jacobian +- for (int i = idx1 + offset; i < idx2; i += 16) { +- float temp = jc[i]; +- // every 8 thread will read the same position. +- int index = tex1Dfetch(tex_jte_cmt, i >> 4); +- float v = tex1Dfetch(tex_jte_pex, (index << 1) + part); +- ////////////////////// +- result += temp * v; +- } +- value[threadIdx.x] = result; +- // write back +- if (offset < 8) jte[(cam << 3) + offset] = (result + value[threadIdx.x + 8]); +-} +- +-template +-__global__ void jte_cam_vec_kernel(int num, float* jte) { +- __shared__ float value[KH * 128]; +- int cam = blockIdx.x * KH + threadIdx.y; +- if (cam >= num) return; +- +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 2; // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 2; // last camera + 1 +- int part = (threadIdx.x & 0x02) ? 1 : 0; +- +- float rx = 0, ry = 0, rz = 0, rw = 0; +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- float4 temp; +- if (TEXN == 1) { +- temp = tex1Dfetch(tex_jte_jc, i); +- } +- if (TEXN == 2) { +- int texid = i >> 25; +- if (texid == 0) +- temp = tex1Dfetch(tex_jte_jc, i); +- else +- temp = tex1Dfetch(tex_jte_jc2, (i & 0x1ffffff)); +- } +- if (TEXN == 4) { +- int index = tex1Dfetch(tex_jte_cmt, i >> 2); +- int iii = (index << 2) + (i & 0x3); +- int texid = iii >> 25; +- ///////////////////////////////// +- if (texid == 0) +- temp = tex1Dfetch(tex_jte_jc, iii); +- else if (texid == 1) +- temp = tex1Dfetch(tex_jte_jc2, (iii & 0x1ffffff)); +- else if (texid == 2) +- temp = tex1Dfetch(tex_jte_jc3, (iii & 0x1ffffff)); +- else +- temp = tex1Dfetch(tex_jte_jc4, (iii & 0x1ffffff)); +- } +- int index = tex1Dfetch(tex_jte_cmt, i >> 2); +- float vv = tex1Dfetch(tex_jte_pex, (index << 1) + part); +- rx += temp.x * vv; +- ry += temp.y * vv; +- rz += temp.z * vv; +- rw += temp.w * vv; +- } +- //////////////////////////////////// +- int widx = (threadIdx.y << 7) + (threadIdx.x << 2); +- /////////////////////////////////// +- // write back +- value[widx] = rx; +- value[widx + 1] = ry; +- value[widx + 2] = rz; +- value[widx + 3] = rw; +- //////////////////////////////////// +- int ridx = (threadIdx.y << 7) + threadIdx.x; +- value[ridx] = ((value[ridx] + value[ridx + 32]) + +- (value[ridx + 64] + value[ridx + 96])); +- if (threadIdx.x < 16) value[ridx] += value[ridx + 16]; +- if (threadIdx.x < 8) +- jte[(cam << 3) + threadIdx.x] = value[ridx] + value[ridx + 8]; +-} +- +-template +-__global__ void jte_cam_vec32_kernel(int num, float* jc, float* jte) { +- __shared__ float value[KH * 32]; +- int cam = blockIdx.x * KH + threadIdx.y; +- if (cam >= num) return; +- float sum = 0; +- int rowpos = (threadIdx.y << 5); +- int index = threadIdx.x + rowpos; +- int xypart = (threadIdx.x & 0x08) ? 1 : 0; +- int part2 = threadIdx.x & 0xf; +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1) << 4; // last camera + 1 +- +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- int index = tex1Dfetch(tex_jte_cmt, i >> 4); +- float temp; +- if (JT) +- temp = jc[i]; +- else +- temp = jc[(index << 4) + part2]; +- +- float v = tex1Dfetch(tex_jte_pex, (index << 1) + xypart); +- sum += temp * v; +- } +- value[index] = sum; +- +- if (threadIdx.x < 16) value[index] += value[index + 16]; +- if (threadIdx.x < 8) +- jte[(cam << 3) + threadIdx.x] = value[index] + value[index + 8]; +-} +- +-///////////////////////////////////////////////////////////// +-texture tex_jte_jp; +-texture tex_jte_pmp; +-texture tex_jte_jp2; +- +-__global__ void jte_point_kernel(int num, float4* jte) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jte_pmp, index); // first camera +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last camera + 1 +- float4 result = make_float4(0, 0, 0, 0); +- for (int i = idx1; i < idx2; ++i) { +- // error vector +- float2 ev = tex1Dfetch(tex_jte_pe, i); +- +- float4 j1 = tex1Dfetch(tex_jte_jp, i << 1); +- result.x += j1.x * ev.x; +- result.y += j1.y * ev.x; +- result.z += j1.z * ev.x; +- +- float4 j2 = tex1Dfetch(tex_jte_jp, 1 + (i << 1)); +- result.x += j2.x * ev.y; +- result.y += j2.y * ev.y; +- result.z += j2.z * ev.y; +- } +- jte[index] = result; +-} +- +-//////////////////// +-// faster but not always more accurate +-//#define JTE_POINT_VEC2 +- +-template +-__global__ void jte_point_vec_kernel(int num, int rowsz, float* jte) { +- //////////////////////////// +- __shared__ float value[KH * 128]; +- int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz; +- if (index >= num) return; +-#ifdef JTE_POINT_VEC2 +- int idx1 = tex1Dfetch(tex_jte_pmp, index); // first +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1 +-#else +- int idx1 = tex1Dfetch(tex_jte_pmp, index) << 1; // first +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1) << 1; // last + 1 +-#endif +- float rx = 0, ry = 0, rz = 0; +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- if (TEXN == 2 && i >> 25) { +-#ifdef JTE_POINT_VEC2 +- +- float2 vv = tex1Dfetch(tex_jte_pe, i); +- float4 jp1 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1)); +- float4 jp2 = tex1Dfetch(tex_jte_jp, ((i & 0x1ffffff) << 1) + 1); +- rx += (jp1.x * vv.x + jp2.x * vv.y); +- ry += (jp1.y * vv.x + jp2.y * vv.y); +- rz += (jp1.z * vv.x + jp2.z * vv.y); +-#else +- float vv = tex1Dfetch(tex_jte_pex, i); +- float4 jpi = tex1Dfetch(tex_jte_jp2, i & 0x1ffffff); +- rx += jpi.x * vv; +- ry += jpi.y * vv; +- rz += jpi.z * vv; +-#endif +- } else { +-#ifdef JTE_POINT_VEC2 +- float2 vv = tex1Dfetch(tex_jte_pe, i); +- float4 jp1 = tex1Dfetch(tex_jte_jp, (i << 1)); +- float4 jp2 = tex1Dfetch(tex_jte_jp, (i << 1) + 1); +- rx += (jp1.x * vv.x + jp2.x * vv.y); +- ry += (jp1.y * vv.x + jp2.y * vv.y); +- rz += (jp1.z * vv.x + jp2.z * vv.y); +-#else +- float vv = tex1Dfetch(tex_jte_pex, i); +- float4 jpi = tex1Dfetch(tex_jte_jp, i); +- rx += jpi.x * vv; +- ry += jpi.y * vv; +- rz += jpi.z * vv; +-#endif +- } +- } +- +- int rowp = threadIdx.y << 7; +- int loc = (threadIdx.x << 2) + rowp; +- value[loc] = rx; +- value[loc + 1] = ry; +- value[loc + 2] = rz; +- value[loc + 3] = 0; +- +- int ridx = threadIdx.x + rowp; +- value[ridx] = ((value[ridx] + value[ridx + 32]) + +- (value[ridx + 64] + value[ridx + 96])); +- if (threadIdx.x < 16) value[ridx] += value[ridx + 16]; +- if (threadIdx.x < 8) value[ridx] += value[ridx + 8]; +- 
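The vectorized jte_point_vec_kernel around this point is a throughput-oriented variant of jte_point_kernel just above: for every point, accumulate Jp^T * e over the projections listed in the point map. A minimal sketch of that accumulation with one thread per point and plain global-memory loads in place of the texture fetches follows; the kernel name jtE_point_simple and its parameters are illustrative, not the PBA API.

#include <cuda_runtime.h>

// For point k, projections pmap[k] .. pmap[k+1]-1 contribute Jp^T * e to its 3-vector.
// jp stores two float4 rows per projection: d(ex)/dX and d(ey)/dX (w unused).
__global__ void jtE_point_simple(const float4* jp, const float2* err,
                                 const int* pmap, int npoint, float4* jte) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= npoint) return;
  float3 acc = make_float3(0.0f, 0.0f, 0.0f);
  for (int i = pmap[k]; i < pmap[k + 1]; ++i) {
    float2 e = err[i];
    float4 jx = jp[2 * i];      // row for the x-residual
    float4 jy = jp[2 * i + 1];  // row for the y-residual
    acc.x += jx.x * e.x + jy.x * e.y;
    acc.y += jx.y * e.x + jy.y * e.y;
    acc.z += jx.z * e.x + jy.z * e.y;
  }
  jte[k] = make_float4(acc.x, acc.y, acc.z, 0.0f);
}

The deleted kernel instead assigns 32 threads per point so that consecutive threads read consecutive jp entries, then folds the per-thread partials through the strided shared-memory steps seen here.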
if (threadIdx.x < 4) +- jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4]; +-} +- +-#define JTE_CAMERA_VEC +-#define JTE_POINT_VEC +- +-void ProgramCU::ComputeJtE(CuTexImage& pe, CuTexImage& jc, CuTexImage& cmap, +- CuTexImage& cmlist, CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& jte, bool jc_transpose, int mode) { +- ////////////////////////////////////////////////////////// +- int ncam = int(cmap.GetImgWidth() - 1); // how many cameras +- size_t szjc = jc.GetDataSize(); +- +- ////////////////////////////// +- cmap.BindTexture(tex_jte_cmp); +- cmlist.BindTexture(tex_jte_cmt); +-#ifdef JTE_CAMERA_VEC2 +- pe.BindTexture(tex_jte_pex); +- const unsigned int bheight = 2; +- dim3 block1(32, bheight), grid1((ncam + bheight - 1) / bheight); +- if (mode == 2) { +- } else if (jc_transpose) +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- else +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- +-#elif defined(JTE_CAMERA_VEC) +- pe.BindTexture(tex_jte_pex); +- const unsigned int bheight = 2; +- unsigned int len1 = ncam * 32; +- unsigned int bsize1 = 32 * bheight; +- unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1; +- dim3 grid1(nblock1); +- dim3 block1(32, bheight); +- if (mode == 2) { +- // skip camera +- } else if (szjc > 2 * MAX_TEXSIZE || !jc_transpose) { +- if (jc_transpose) +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- else +- jte_cam_vec32_kernel<<>>(ncam, jc.data(), +- jte.data()); +- } else if (szjc > MAX_TEXSIZE) { +- jc.BindTexture2(tex_jte_jc, tex_jte_jc2); +- jte_cam_vec_kernel<<>>(ncam, jte.data()); +- } else { +- jc.BindTexture(tex_jte_jc); +- jte_cam_vec_kernel<<>>(ncam, jte.data()); +- } +-#else +- pe.BindTexture(tex_jte_pex); +- unsigned int len1 = ncam * 16; +- unsigned int bsize1 = len1 > 32 * 128 ? 128 : (len1 > 32 * 64 ? 
64 : 32); +- unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1; +- dim3 grid1(nblock1), block1(bsize1); +- jte_cam_kernel<<>>(len1, jc.data(), jte.data()); +-#endif +- CheckErrorCUDA("ComputeJtE"); +- +- //////////////////////////////////////////// +- pmap.BindTexture(tex_jte_pmp); +- unsigned int npoint = (pmap.GetImgWidth() - 1); +-#ifndef JTE_POINT_VEC +- size_t len2 = npoint; +- unsigned int bsize2 = 64; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- dim3 grid2(nblock2), block2(bsize2); +- pe.BindTexture(tex_jte_pe); +- jp.BindTexture(tex_jte_jp); +- jte_point_kernel<<>>(len2, ((float4*)jte.data()) + 2 * ncam); +-#else +- +-#ifdef JTE_POINT_VEC2 +- pe.BindTexture(tex_jte_pe); +-#else +- pe.BindTexture(tex_jte_pex); +-#endif +- const unsigned int bheight2 = 2; +- unsigned int bsize2 = 32; +- unsigned int nblock2 = (unsigned int)((npoint + bheight2 - 1) / bheight2); +- unsigned int offsetv = 8 * ncam; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2, bheight2); +- if (mode == 1) { +- // skip point +- } else if (jp.GetDataSize() > MAX_TEXSIZE) { +- jp.BindTexture2(tex_jte_jp, tex_jte_jp2); +- jte_point_vec_kernel<<>>( +- npoint, bw * bheight2, ((float*)jte.data()) + offsetv); +- } else { +- jp.BindTexture(tex_jte_jp); +- jte_point_vec_kernel<<>>( +- npoint, bw * bheight2, ((float*)jte.data()) + offsetv); +- } +-#endif +- CheckErrorCUDA("ComputeJtE"); +-} +- +-texture tex_jtjd_cmp; +-texture tex_jtjd_cmlist; +- +-template +-__global__ void jtjd_cam_vec32_kernel(int num, int add_existing_dq, float* jc, +- float* jtjd, float* jtjdi) { +- __shared__ float value[KH * 32]; +- +- // 8thread per camera +- int cam = blockIdx.x * KH + threadIdx.y; +- int part = threadIdx.x & 0x7; // which parameter of this camera +- int part2 = threadIdx.x & 0xf; +- int campos = threadIdx.y << 5; +- int index = threadIdx.x + campos; +- float sum = 0; +- if (cam < num && part < VN) { +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1 +- +- // loop to read the index of the projection. +- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- if (JT) { +- float temp = jc[i]; +- sum += temp * temp; +- } else { +- int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4; +- float temp = jc[ii + part2]; +- sum += temp * temp; +- } +- } +- } +- __syncthreads(); +- +- if (cam >= num) return; +- // save all the results? +- value[index] = sum; +- if (threadIdx.x < 16) value[index] += value[index + 16]; +- if (threadIdx.x < 8) +- +- // write back +- if (threadIdx.x < 8) { +- float temp = value[index] + value[index + 8]; +- int wpos = threadIdx.x + (cam << 3); +- if (add_existing_dq) temp += jtjd[wpos]; +- jtjd[wpos] = temp; +- jtjdi[wpos] = temp == 0 ? 
0 : 1 / (temp); +- } +-} +- +-texture tex_jtjd_jp; +-texture tex_jtjd_pmp; +-texture tex_jtjd_jp2; +- +-#define JTJD_POINT_KWIDTH 64 +- +-template +-__global__ void jtjd_point_kernel(int num, int rowsz, float4* jtjd, +- float4* jtjdi) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera +- int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1 +- float rx = 0, ry = 0, rz = 0; +- for (int i = idx1; i < idx2; ++i) { +- if (TEXN == 2 && i > 0xffffff) { +- float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1); +- rx += j1.x * j1.x; +- ry += j1.y * j1.y; +- rz += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1)); +- rx += j2.x * j2.x; +- ry += j2.y * j2.y; +- rz += j2.z * j2.z; +- } else { +- float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1); +- rx += j1.x * j1.x; +- ry += j1.y * j1.y; +- rz += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1)); +- rx += j2.x * j2.x; +- ry += j2.y * j2.y; +- rz += j2.z * j2.z; +- } +- } +- +- if (jtjd) jtjd[index] = make_float4(rx, ry, rz, 0.0f); +- jtjdi[index] = make_float4(1.0f / rx, 1.0f / ry, 1.0f / rz, 0.0f); +-} +- +-void ProgramCU::ComputeDiagonal(CuTexImage& jc, CuTexImage& cmap, +- CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& cmlist, CuTexImage& jtjd, +- CuTexImage& jtjdi, bool jc_transpose, +- int radial, bool add_existing_diagc) { +- ////////////////////////////////////////////////////////// +- size_t szjc = jc.GetDataSize(); +- unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras +- +- const unsigned int bheight = 2; +- dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight); +- cmap.BindTexture(tex_jtjd_cmp); +- if (jc_transpose) { +- if (radial) +- jtjd_cam_vec32_kernel<8, bheight, true><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- else +- jtjd_cam_vec32_kernel<7, bheight, true><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- } else { +- cmlist.BindTexture(tex_jtjd_cmlist); +- if (radial) +- jtjd_cam_vec32_kernel<8, bheight, false><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- else +- jtjd_cam_vec32_kernel<7, bheight, false><<>>( +- ncam, add_existing_diagc, jc.data(), jtjd.data(), jtjdi.data()); +- } +- CheckErrorCUDA("ComputeDiagonal"); +- +- //////////////////////////////////////////// +- unsigned int npoint = (pmap.GetImgWidth() - 1); +- unsigned int len2 = npoint; +- unsigned int bsize2 = JTJD_POINT_KWIDTH; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- pmap.BindTexture(tex_jtjd_pmp); +- +- if (jp.GetDataSize() > MAX_TEXSIZE) { +- jp.BindTexture2(tex_jtjd_jp, tex_jtjd_jp2); +- jtjd_point_kernel<2><<>>(len2, (bw * bsize2), +- ((float4*)jtjd.data()) + 2 * ncam, +- ((float4*)jtjdi.data()) + 2 * ncam); +- } else { +- jp.BindTexture(tex_jtjd_jp); +- jtjd_point_kernel<1><<>>(len2, (bw * bsize2), +- ((float4*)jtjd.data()) + 2 * ncam, +- ((float4*)jtjdi.data()) + 2 * ncam); +- } +- CheckErrorCUDA("ComputeDiagonal"); +-} +- +-// for each +-template +-__global__ void jtjd_cam_q_kernel(int num, int rowsz, float* qw, float4* diag) { +- int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y; +- int index = bindex + threadIdx.x; +- if (index >= num) return; +- int tid = index & 0x1; +- float w = qw[index], ws 
= w * w * 2.0f; +- if (SJ) { +- float4 sj = tex1Dfetch(tex_jacobian_sj, index); +- float4 dj = tid == 0 ? make_float4(sj.x * sj.x * ws, 0, 0, 0) +- : make_float4(0, 0, 0, sj.w * sj.w * ws); +- diag[index] = dj; +- } else { +- float4 dj = tid == 0 ? make_float4(ws, 0, 0, 0) : make_float4(0, 0, 0, ws); +- diag[index] = dj; +- } +-} +- +-void ProgramCU::ComputeDiagonalQ(CuTexImage& qlistw, CuTexImage& sj, +- CuTexImage& diag) { +- unsigned int bsize = 32; +- unsigned int len = qlistw.GetImgWidth() * 2; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- jtjd_cam_q_kernel<<>>(len, (bw * bsize), qlistw.data(), +- (float4*)diag.data()); +- } else { +- jtjd_cam_q_kernel<<>>(len, (bw * bsize), qlistw.data(), +- (float4*)diag.data()); +- } +- CheckErrorCUDA("ComputeDiagonalQ"); +-} +- +-template +-__global__ void jtjd_cam_block_vec32_kernel(int num, float lambda1, +- float lambda2, float* jc, +- float* diag, float* blocks, +- bool add_existing_diagc) { +- __shared__ float value[KH * 32 * VN]; +- +- // 8thread per camera +- int cam = blockIdx.x * KH + threadIdx.y; +- int part = threadIdx.x & 0x7; // which parameter of this camera +- int part2 = threadIdx.x & 0xf; +- int index = threadIdx.x + (threadIdx.y << 5); +- float row[8] = {0, 0, 0, 0, 0, 0, 0, 0}; +- if (cam < num) { +- int rowpos = index - part; +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jtjd_cmp, cam) << 4; // first camera +- int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1) << 4; // last camera + 1 +- +- // loop to read the index of the projection. +- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- if (JT) { +- float temp = jc[i]; +- value[index] = temp; +- for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]); +- } else { +- int ii = tex1Dfetch(tex_jtjd_cmlist, i >> 4) << 4; +- float temp = jc[ii + part2]; +- value[index] = temp; +- for (int j = 0; j < VN; ++j) row[j] += (temp * value[rowpos + j]); +- } +- } +- } +- __syncthreads(); +- +- if (cam >= num) return; +- // save all the results? +- for (int i = 0; i < VN; ++i) value[index * VN + i] = row[i]; +- int campos = threadIdx.y * (32 * VN); +- for (int i = threadIdx.x; i < (VN * 16); i += 32) +- value[campos + i] += value[campos + i + (16 * VN)]; +- for (int i = threadIdx.x; i < (VN * 8); i += 32) +- value[campos + i] += value[campos + i + (8 * VN)]; +- +- if (VN == 7) { +- bool zero = (part >= VN); +- +- // write back +- if (threadIdx.x < 8) { +- float* dp = value + campos + threadIdx.x * (VN + 1); +- float temp = zero ? 0 : dp[0]; +- int didx = threadIdx.x + (cam << 3); +- if (add_existing_diagc) temp += diag[didx]; +- diag[didx] = temp; +- dp[0] = lambda1 + lambda2 * temp; +- } +- int wpos = cam * (8 * VN) + threadIdx.x; +- int rpos = campos + threadIdx.x - (threadIdx.x >> 3); +- blocks[wpos] = zero ? 0 : value[rpos]; +- if (threadIdx.x < (VN * 8 - 32)) +- blocks[wpos + 32] = zero ? 
0 : value[rpos + 28]; +- } else { +- // write back +- if (threadIdx.x < 8) { +- float* dp = value + campos + threadIdx.x * (VN + 1); +- float temp = dp[0]; +- int didx = threadIdx.x + (cam << 3); +- if (add_existing_diagc) temp += diag[didx]; +- diag[didx] = temp; +- dp[0] = lambda1 + lambda2 * temp; // max(, 1e-6) ; +- } +- int wpos = cam * (8 * VN) + threadIdx.x; +- int rpos = campos + threadIdx.x; +- blocks[wpos] = value[rpos]; +- blocks[wpos + 32] = value[rpos + 32]; +- } +-} +- +-#define JTJD_POINT_BLOCK_KWIDTH 64 +- +-template +-__global__ void jtjd_point_block_kernel(int num, int rowsz, float lambda1, +- float lambda2, float4* diag, +- float4* blocks) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first camera +- int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last camera + 1 +- +- float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0; +- for (int i = idx1; i < idx2; ++i) { +- if (TEXN == 2 && i > 0xffffff) { +- float4 j1 = tex1Dfetch(tex_jtjd_jp2, (i & 0xffffff) << 1); +- M00 += j1.x * j1.x; +- M01 += j1.x * j1.y; +- M02 += j1.x * j1.z; +- M11 += j1.y * j1.y; +- M12 += j1.y * j1.z; +- M22 += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp2, 1 + ((i & 0xffffff) << 1)); +- M00 += j2.x * j2.x; +- M01 += j2.x * j2.y; +- M02 += j2.x * j2.z; +- M11 += j2.y * j2.y; +- M12 += j2.y * j2.z; +- M22 += j2.z * j2.z; +- } else { +- float4 j1 = tex1Dfetch(tex_jtjd_jp, i << 1); +- M00 += j1.x * j1.x; +- M01 += j1.x * j1.y; +- M02 += j1.x * j1.z; +- M11 += j1.y * j1.y; +- M12 += j1.y * j1.z; +- M22 += j1.z * j1.z; +- +- float4 j2 = tex1Dfetch(tex_jtjd_jp, 1 + (i << 1)); +- M00 += j2.x * j2.x; +- M01 += j2.x * j2.y; +- M02 += j2.x * j2.z; +- M11 += j2.y * j2.y; +- M12 += j2.y * j2.z; +- M22 += j2.z * j2.z; +- } +- } +- +- diag[index] = make_float4(M00, M11, M22, 0); +- +- M00 = lambda2 * M00 + lambda1; +- M11 = lambda2 * M11 + lambda1; +- M22 = lambda2 * M22 + lambda1; +- +- // invert the 3x3 matrix. +- float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 1] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 2] = make_float4(0, 0, 0, 0); +- } else { +- float m00 = (M11 * M22 - M12 * M12) / det; +- float m01 = -(M01 * M22 - M12 * M02) / det; +- float m02 = (M01 * M12 - M02 * M11) / det; +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(m00, m01, m02, 0); +- +- float m11 = (M00 * M22 - M02 * M02) / det; +- float m12 = -(M00 * M12 - M01 * M02) / det; +- blocks[write_pos + 1] = make_float4(m01, m11, m12, 0); +- +- float m22 = (M00 * M11 - M01 * M01) / det; +- blocks[write_pos + 2] = make_float4(m02, m12, m22, 0); +- } +-} +- +-#define JTJD_BLOCK_CAM_INVERT_KWIDTH 64 +-template +-__global__ void jtjd_cam_block_invert_kernel(int num, float4* blocks) { +- // N / 8 cameras...each have 64 floats,,,, N * 8 float +- // each will read 8 float...... 
+- __shared__ float value[JTJD_BLOCK_CAM_INVERT_KWIDTH * VN]; +- __shared__ bool invalid[JTJD_BLOCK_CAM_INVERT_KWIDTH / 8]; +- ////////////////////////////////////////////// +- +- int bindex = IMUL(blockIdx.x, blockDim.x); +- int index = bindex + threadIdx.x; +- int block_read_pos = IMUL(bindex, VN); +- for (int i = 0; i < JTJD_BLOCK_CAM_INVERT_KWIDTH * VN; +- i += JTJD_BLOCK_CAM_INVERT_KWIDTH) +- value[threadIdx.x + i] = ((float*)blocks)[block_read_pos + threadIdx.x + i]; +- __syncthreads(); +- const int cam_id = threadIdx.x >> 3; +- const int cam_pos = IMUL(cam_id, VN * 8); +- const int col = threadIdx.x & 0x7, rowj_pos = col << 3; +- ; // +- +- float* a = value + cam_pos; +- for (int i = 0; i < VN; ++i) { +- int rowi_pos = i << 3, dpos = i + rowi_pos; +- if (col == i && a[dpos] > 0) a[dpos] = rsqrt(a[dpos]); +- __syncthreads(); +- float diag = a[dpos]; +- if (diag == 0 || col >= VN) continue; +- if (col < i) { +- a[rowi_pos + col] = 0; +- } else if (col > i) { +- float aij = a[rowi_pos + col] * diag; +- a[rowi_pos + col] = aij; +- for (int k = col; k < VN; ++k) a[rowj_pos + k] -= a[rowi_pos + k] * aij; +- } +- } +- +- if (index >= num) return; +- +- if (col == 0) invalid[cam_id] = false; +- if (col < VN) { +- for (int i = 1; i < VN; ++i) { +- int rowi_pos = i << 3, dpos = i + rowi_pos; +- if (a[dpos] == 0) continue; +- if (col < i) { +- float sum = 0; +- for (int k = col; k < i; ++k) +- sum += (a[(k << 3) + i] * a[rowj_pos + k]); +- a[rowj_pos + i] = -sum * a[dpos]; +- } +- } +- float ai[8], amax = 0; +- for (int i = 0; i < VN * 8; i += 8) { +- float sum = 0; +- for (int k = 0; k < VN; k++) sum += a[rowj_pos + k] * a[i + k]; +- ai[i >> 3] = sum; +- amax = max(amax, sum); +- } +- +- if (isinf(amax)) invalid[cam_id] = true; +- int write_pos = IMUL((index >> 3), (VN * 2)) + (col << 1); +- if (invalid[cam_id]) // a better way would be using a threshold +- { +- blocks[write_pos] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 1] = make_float4(0, 0, 0, 0); +- } else { +- blocks[write_pos] = make_float4(ai[0], ai[1], ai[2], ai[3]); +- blocks[write_pos + 1] = +- make_float4(ai[4], ai[5], ai[6], VN < 8 ? 0 : ai[7]); +- } +- } +-} +- +-void ProgramCU::ComputeDiagonalBlock(float lambda, bool dampd, CuTexImage& jc, +- CuTexImage& cmap, CuTexImage& jp, +- CuTexImage& pmap, CuTexImage& cmlist, +- CuTexImage& diag, CuTexImage& blocks, +- int radial_distortion, bool jc_transpose, +- bool add_existing_diagc, int mode) { +- size_t szjc = jc.GetDataSize(); +- unsigned int ncam = (cmap.GetImgWidth() - 1); // how many cameras +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? (1.0f + lambda) : 1.0f; +- const unsigned int bheight = 2; +- dim3 block1x(32, bheight), grid1x((ncam + bheight - 1) / bheight); +- cmap.BindTexture(tex_jtjd_cmp); +- +- if (mode == 2) { +- // point only mode? 
+- } else if (radial_distortion) { +- if (jc_transpose) { +- jtjd_cam_block_vec32_kernel<8, bheight, true><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } else { +- cmlist.BindTexture(tex_jtjd_cmlist); +- jtjd_cam_block_vec32_kernel<8, bheight, false><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } +- } else { +- if (jc_transpose) { +- jtjd_cam_block_vec32_kernel<7, bheight, true><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } else { +- cmlist.BindTexture(tex_jtjd_cmlist); +- jtjd_cam_block_vec32_kernel<7, bheight, false><<>>( +- ncam, lambda1, lambda2, jc.data(), diag.data(), blocks.data(), +- add_existing_diagc); +- } +- } +- CheckErrorCUDA("ComputeDiagonalBlock"); +- +- //////////////////////////////////////////// +- unsigned int npoint = (pmap.GetImgWidth() - 1); +- unsigned int len2 = npoint; +- unsigned int bsize2 = JTJD_POINT_BLOCK_KWIDTH; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int bw, bh; +- unsigned int offsetd = 2 * ncam; +- unsigned int offsetb = (radial_distortion ? 16 : 14) * ncam; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- pmap.BindTexture(tex_jtjd_pmp); +- if (mode == 1) { +- // camera only mode? +- } else if (jp.GetDataSize() > MAX_TEXSIZE) { +- jp.BindTexture2(tex_jtjd_jp, tex_jtjd_jp2); +- jtjd_point_block_kernel<2><<>>( +- len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb); +- } else { +- jp.BindTexture(tex_jtjd_jp); +- jtjd_point_block_kernel<1><<>>( +- len2, (bw * bsize2), lambda1, lambda2, ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb); +- } +- CheckErrorCUDA("ComputeDiagonalBlock"); +- +- if (mode != 2) { +- unsigned int len3 = ncam * 8; +- unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH; +- unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3; +- dim3 grid3(nblock3), block3(bsize3); +- if (radial_distortion) +- jtjd_cam_block_invert_kernel<8><<>>( +- len3, (float4*)blocks.data()); +- else +- jtjd_cam_block_invert_kernel<7><<>>( +- len3, (float4*)blocks.data()); +- CheckErrorCUDA("ComputeDiagonalBlockInverse"); +- } +-} +- +-template +-__global__ void multiply_block_conditioner_kernel(int num, int rowsz, +- float* blocks, float* x, +- float* result) { +- __shared__ float mat[WIDTH * VSZ]; +- __shared__ float val[WIDTH]; +- const int BSZ = 1 << BBIT; +- const int BMASK = BSZ - 1; +- int bindex = IMUL(blockIdx.x, blockDim.x) + rowsz * blockIdx.y; +- int index = bindex + threadIdx.x; +- int block_read_pos = bindex * VSZ; +- val[threadIdx.x] = x[index]; +- for (int i = 0; i < VSZ * WIDTH; i += WIDTH) +- mat[i + threadIdx.x] = blocks[i + block_read_pos + threadIdx.x]; +- __syncthreads(); +- if (index >= num) return; +- float* ac = mat + (threadIdx.x >> BBIT) * (BSZ * VSZ) + (threadIdx.x & BMASK); +- float* xc = val + (threadIdx.x & (~BMASK)); +- float sum = 0; +- for (int i = 0; i < VSZ; ++i) sum += ac[i << BBIT] * xc[i]; +- result[index] = sum; // isinf(sum) ? 
0 : sum ; // +-} +- +-void ProgramCU::MultiplyBlockConditioner(int ncam, int npoint, +- CuTexImage& blocks, CuTexImage& vector, +- CuTexImage& result, int radial, +- int mode) { +- const unsigned int bsize1 = 64; +- unsigned int bw, bh; +- +- if (mode != 2) { +- unsigned int len1 = ncam * 8; +- unsigned int nblock1 = (len1 + bsize1 - 1) / bsize1; +- GetBlockConfiguration(nblock1, bw, bh); +- dim3 grid1(bw, bh), block1(bsize1); +- if (radial) +- multiply_block_conditioner_kernel<<>>( +- len1, (bw * bsize1), blocks.data(), vector.data(), result.data()); +- else +- multiply_block_conditioner_kernel<<>>( +- len1, (bw * bsize1), blocks.data(), vector.data(), result.data()); +- CheckErrorCUDA("MultiplyBlockConditioner"); +- } +- +- if (mode != 1) { +- const unsigned int bsize2 = 128; +- unsigned int len2 = npoint * 4; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int cbsz = radial ? 64 : 56; +- unsigned int offsetb = ncam * cbsz; +- unsigned int offsetd = ncam * 8; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- multiply_block_conditioner_kernel<<>>( +- len2, (bw * bsize2), blocks.data() + offsetb, vector.data() + offsetd, +- result.data() + offsetd); +- CheckErrorCUDA("MultiplyBlockConditioner"); +- } +-} +- +-texture tex_shuffle_jc; +-texture tex_shuffle_map; +-texture tex_shuffle_jc2; +-template +-__global__ void shuffle_camera_jacobian_kernel(int num, int bwidth, +- float4* jc) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- int fetch_idx = tex1Dfetch(tex_shuffle_map, index >> 2); +- if (TEXN == 2) { +- int texidx = fetch_idx >> 23, +- fidx = ((fetch_idx & 0x7fffff) << 2) + (index & 0x3); +- if (texidx == 0) +- jc[index] = tex1Dfetch(tex_shuffle_jc, fidx); +- else if (texidx == 1) +- jc[index] = tex1Dfetch(tex_shuffle_jc2, fidx); +- } +- if (TEXN == 1) { +- jc[index] = tex1Dfetch(tex_shuffle_jc, (fetch_idx << 2) + (index & 0x3)); +- } +-} +- +-bool ProgramCU::ShuffleCameraJacobian(CuTexImage& jc, CuTexImage& map, +- CuTexImage& result) { +- if (!result.IsValid()) return false; +- size_t szjc = jc.GetDataSize(); +- unsigned int len = map.GetImgWidth() * 4; +- unsigned int bsize = 128; +- unsigned int nblock = (len + bsize - 1) / bsize; +- +- map.BindTexture(tex_shuffle_map); +- +- if (szjc > 2 * MAX_TEXSIZE) { +- fprintf(stderr, "datasize way too big %lX, %lX+...\n", szjc, +- (szjc) / MAX_TEXSIZE); +- return false; +- } else if (szjc > MAX_TEXSIZE) { +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- jc.BindTexture2(tex_shuffle_jc, tex_shuffle_jc2); +- shuffle_camera_jacobian_kernel<2><<>>(len, (bw * bsize), +- (float4*)result.data()); +- } else { +- jc.BindTexture(tex_shuffle_jc); +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- shuffle_camera_jacobian_kernel<1><<>>(len, (bw * bsize), +- (float4*)result.data()); +- } +- CheckErrorCUDA("ShuffleCameraJacobian"); +- return true; +-} +- +-texture tex_mjx_jc; +-texture tex_mjx_jc2; +-texture tex_mjx_jc3; +-texture tex_mjx_jc4; +-texture tex_mjx_jp; +-texture tex_mjx_jp2; +-texture tex_mjx_idx; +-texture tex_mjx_x; +- +-template +-__global__ void multiply_jx_kernel(int num, int bwidth, int offset, +- float* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- if (TEXN == 4 && (index >> 24) == 3) { +- //////////////////////////////////////////// +- int2 
proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff); +- jc1 = tex1Dfetch(tex_mjx_jc4, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } else if (TEXN > 2 && (index >> 24) == 2) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff); +- jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } else if (TEXN > 1 && (index > 0xffffff)) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp, index & 0x1ffffff); +- jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } else { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp, jc1, jc2; +- jp = tex1Dfetch(tex_mjx_jp, index); +- jc1 = tex1Dfetch(tex_mjx_jc, index << 1); +- jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w + jp.x * xp.x + jp.y * xp.y + +- jp.z * xp.z; +- } +-} +- +-template +-__global__ void multiply_jcx_kernel(int num, int bwidth, float* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- if (TEXN == 4 && (index >> 24) == 3) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc4, (index & 
0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc4, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } else if (TEXN > 2 && (index >> 24) == 2) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc3, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc3, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } else if (TEXN > 1 && (index > 0xffffff)) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc2, (index & 0xffffff) << 1); +- jc2 = tex1Dfetch(tex_mjx_jc2, ((index & 0xffffff) << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } else { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float4 jc1, jc2; +- jc1 = tex1Dfetch(tex_mjx_jc, index << 1); +- jc2 = tex1Dfetch(tex_mjx_jc, (index << 1) + 1); +- +- ///////////////////////////////////// +- result[index] = jc1.x * xc1.x + jc1.y * xc1.y + jc1.z * xc1.z + +- jc1.w * xc1.w + jc2.x * xc2.x + jc2.y * xc2.y + +- jc2.z * xc2.z + jc2.w * xc2.w; +- } +-} +- +-template +-__global__ void multiply_jpx_kernel(int num, int bwidth, int offset, +- float* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- if (TEXN == 2 && index > 0x1ffffff) { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- //////////////////////////////////////////// +- float4 jp = tex1Dfetch(tex_mjx_jp2, index & 0x1ffffff); +- ///////////////////////////////////// +- result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z; +- } else { +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float4 jp = tex1Dfetch(tex_mjx_jp, index); +- ///////////////////////////////////// +- result[index] = jp.x * xp.x + jp.y * xp.y + jp.z * xp.z; +- } +-} +- +-template +-__global__ void multiply_jx_notex2_kernel(int num, int bwidth, int offset, +- float* jcx, float* jpx, +- float* result) { +- int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- int index = threadIdx.x + bindex; +- +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, 
proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- //////////////////////////////////////////// +- __shared__ float jps[KW * 4]; +- __shared__ float jcs[KW * 8]; +- +- for (int i = threadIdx.x; i < 4 * KW; i += KW) +- jps[i] = jpx[(bindex << 2) + i]; +- for (int i = threadIdx.x; i < 8 * KW; i += KW) +- jcs[i] = jcx[(bindex << 3) + i]; +- +- __syncthreads(); +- if (index >= num) return; +- +- ///////////////////////////////////// +- float *jp = jps + threadIdx.x * 4, *jc = jcs + threadIdx.x * 8; +- result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z + +- jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y + +- jc[6] * xc2.z + jc[7] * xc2.w + jp[0] * xp.x + jp[1] * xp.y + +- jp[2] * xp.z; +-} +- +-template +-__global__ void multiply_jpx_notex2_kernel(int num, int bwidth, int offset, +- float* jpx, float* result) { +- int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- int index = threadIdx.x + bindex; +- +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- //////////////////////////////////////////// +- __shared__ float jps[KW * 4]; +- +- for (int i = threadIdx.x; i < 4 * KW; i += KW) +- jps[i] = jpx[(bindex << 2) + i]; +- +- __syncthreads(); +- if (index >= num) return; +- +- ///////////////////////////////////// +- float* jp = jps + threadIdx.x * 4; +- result[index] = jp[0] * xp.x + jp[1] * xp.y + jp[2] * xp.z; +-} +- +-template +-__global__ void multiply_jcx_notex2_kernel(int num, int bwidth, float* jcx, +- float* result) { +- int bindex = blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- int index = threadIdx.x + bindex; +- +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index >> 1); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- //////////////////////////////////////////// +- +- __shared__ float jcs[KW * 8]; +- for (int i = threadIdx.x; i < 8 * KW; i += KW) +- jcs[i] = jcx[(bindex << 3) + i]; +- +- __syncthreads(); +- if (index >= num) return; +- +- ///////////////////////////////////// +- float* jc = jcs + threadIdx.x * 8; +- result[index] = jc[0] * xc1.x + jc[1] * xc1.y + jc[2] * xc1.z + +- jc[3] * xc1.w + jc[4] * xc2.x + jc[5] * xc2.y + +- jc[6] * xc2.z + jc[7] * xc2.w; +-} +- +-void ProgramCU::ComputeJX(int point_offset, CuTexImage& x, CuTexImage& jc, +- CuTexImage& jp, CuTexImage& jmap, CuTexImage& result, +- int mode) { +- // given a vector of parameters.... 
+- // multiply the Jacobian Matrix with it [jc jp] * p +- // for each measurment, read back the jacobian +- // multiply and summ up th corresponding +- +- unsigned int nproj = jmap.GetImgWidth(); +- unsigned int len = nproj * 2; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- jmap.BindTexture(tex_mjx_idx); +- x.BindTexture(tex_mjx_x); +- +- if (mode == 0) { +- size_t szjc = jc.GetDataSize(); +- if (TEX_TOOBIG4(szjc)) { +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jx_notex2_kernel<64><<>>( +- len, (bw * bsize), point_offset, jc.data(), jp.data(), result.data()); +- } else if (szjc > 2 * MAX_TEXSIZE) { +- jp.BindTexture2(tex_mjx_jp, tex_mjx_jp2); +- jc.BindTexture4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jx_kernel<4><<>>(len, (bw * bsize), point_offset, +- result.data()); +- } else if (szjc > MAX_TEXSIZE) { +- jp.BindTexture(tex_mjx_jp); +- jc.BindTexture2(tex_mjx_jc, tex_mjx_jc2); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jx_kernel<2><<>>(len, (bw * bsize), point_offset, +- result.data()); +- } else { +- jp.BindTexture(tex_mjx_jp); +- jc.BindTexture(tex_mjx_jc); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bh, bw), block(bsize); +- multiply_jx_kernel<1><<>>(len, (bh * bsize), point_offset, +- result.data()); +- } +- CheckErrorCUDA("ComputeJX"); +- } else if (mode == 1) { +- size_t szjc = jc.GetDataSize(); +- if (TEX_TOOBIG4(szjc)) { +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jcx_notex2_kernel<64><<>>(len, (bw * bsize), +- jc.data(), result.data()); +- } else if (szjc > 2 * MAX_TEXSIZE) { +- jc.BindTexture4(tex_mjx_jc, tex_mjx_jc2, tex_mjx_jc3, tex_mjx_jc4); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jcx_kernel<4><<>>(len, (bw * bsize), result.data()); +- } else if (szjc > MAX_TEXSIZE) { +- jc.BindTexture2(tex_mjx_jc, tex_mjx_jc2); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jcx_kernel<2><<>>(len, (bw * bsize), result.data()); +- } else { +- jc.BindTexture(tex_mjx_jc); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bh, bw), block(bsize); +- multiply_jcx_kernel<1><<>>(len, (bh * bsize), result.data()); +- } +- CheckErrorCUDA("ComputeJCX"); +- } else if (mode == 2) { +- size_t szjp = jp.GetDataSize(); +- if (szjp > MAX_TEXSIZE) { +- jp.BindTexture(tex_mjx_jp); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- multiply_jpx_kernel<2><<>>(len, (bw * bsize), point_offset, +- result.data()); +- } else { +- jp.BindTexture(tex_mjx_jp); +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bh, bw), block(bsize); +- multiply_jpx_kernel<1><<>>(len, (bh * bsize), point_offset, +- result.data()); +- } +- CheckErrorCUDA("ComputeJPX"); +- } +-} +- +-template +-__device__ void jacobian_internal(int camera_pos, int pt_pos, int tidx, +- float* r, float jic, float* jxc, float* jyc, +- float* jxp, float* jyp) { +- float m[3]; +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, 
camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +- +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- +- JACOBIAN_SET_JC_BEGIN +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- ///////////////////////////////////////////////////// +- jxc[0] = p0_p2 * jfc; +- jxc[1] = f_p2_x; +- jxc[2] = 0; +- jxc[3] = -f_p2_x * p0_p2; +- jxc[4] = -f_p2_x * p0_p2 * y0; +- jxc[5] = f_p2_x * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2_x * y0; +- jxc[7] = ft_x_pn * p0_p2; +- +- jyc[0] = p1_p2 * jfc; +- jyc[1] = 0; +- jyc[2] = f_p2_y; +- jyc[3] = -f_p2_y * p1_p2; +- jyc[4] = -f_p2_y * (z0 + y0 * p1_p2); +- jyc[5] = f_p2_y * x0 * p1_p2; +- jyc[6] = f_p2_y * x0; +- jyc[7] = ft_x_pn * p1_p2; +- JACOBIAN_SET_JC_END +- /////////////////////////////////// +- jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2); +- } else { +- JACOBIAN_SET_JC_BEGIN +- jxc[0] = p0_p2 * jic; +- jxc[1] = f_p2; +- jxc[2] = 0; +- jxc[3] = -f_p2 * p0_p2; +- jxc[4] = -f_p2 * p0_p2 * y0; +- jxc[5] = f_p2 * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2 * y0; +- +- jyc[0] = p1_p2 * jic; +- jyc[1] = 0; +- jyc[2] = f_p2; +- jyc[3] = -f_p2 * p1_p2; +- jyc[4] = -f_p2 * (z0 + y0 * p1_p2); +- jyc[5] = f_p2 * x0 * p1_p2; +- jyc[6] = f_p2 * x0; +- +- if (md) { +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jxc[7] = -ms.x * msn; +- jyc[7] = -ms.y * msn; +- } else { +- jxc[7] = 0; +- jyc[7] = 0; +- } +- JACOBIAN_SET_JC_END +- /////////////////////////////////// +- jxp[0] = f_p2 * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2 * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2 * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2 * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2 * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2 * (r[5] - r[8] * p1_p2); +- } +-} +- +-template +-__device__ void jacobian_camera_internal(int camera_pos, int pt_pos, int tidx, +- float* r, float jic, float* jxc, +- float* jyc) { +- float m[3]; +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +-#ifndef PBA_DISABLE_CONST_CAMERA +- if (r3.w != 0.0f) { +- jxc[0] = 0; +- 
jxc[1] = 0; +- jxc[2] = 0; +- jxc[3] = 0; +- jxc[4] = 0; +- jxc[5] = 0; +- jxc[6] = 0; +- jxc[7] = 0; +- jyc[0] = 0; +- jyc[1] = 0; +- jyc[2] = 0; +- jyc[3] = 0; +- jyc[4] = 0; +- jyc[5] = 0; +- jyc[6] = 0; +- jyc[7] = 0; +- } else +-#endif +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- float jfc = jic * (1 + rr1 + rr2); +- float ft_x_pn = jic * ft.x * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- ///////////////////////////////////////////////////// +- jxc[0] = p0_p2 * jfc; +- jxc[1] = f_p2_x; +- jxc[2] = 0; +- jxc[3] = -f_p2_x * p0_p2; +- jxc[4] = -f_p2_x * p0_p2 * y0; +- jxc[5] = f_p2_x * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2_x * y0; +- jxc[7] = ft_x_pn * p0_p2; +- +- jyc[0] = p1_p2 * jfc; +- jyc[1] = 0; +- jyc[2] = f_p2_y; +- jyc[3] = -f_p2_y * p1_p2; +- jyc[4] = -f_p2_y * (z0 + y0 * p1_p2); +- jyc[5] = f_p2_y * x0 * p1_p2; +- jyc[6] = f_p2_y * x0; +- jyc[7] = ft_x_pn * p1_p2; +- } else { +- jxc[0] = p0_p2 * jic; +- jxc[1] = f_p2; +- jxc[2] = 0; +- jxc[3] = -f_p2 * p0_p2; +- jxc[4] = -f_p2 * p0_p2 * y0; +- jxc[5] = f_p2 * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2 * y0; +- +- jyc[0] = p1_p2 * jic; +- jyc[1] = 0; +- jyc[2] = f_p2; +- jyc[3] = -f_p2 * p1_p2; +- jyc[4] = -f_p2 * (z0 + y0 * p1_p2); +- jyc[5] = f_p2 * x0 * p1_p2; +- jyc[6] = f_p2 * x0; +- +- if (md) { +- float2 ms = tex1Dfetch(tex_jacobian_meas, tidx); +- float msn = (ms.x * ms.x + ms.y * ms.y) * jic; +- jxc[7] = -ms.x * msn; +- jyc[7] = -ms.y * msn; +- } else { +- jxc[7] = 0; +- jyc[7] = 0; +- } +- } +-} +- +-template +-__device__ void jacobian_point_internal(int camera_pos, int pt_pos, int tidx, +- float* r, float* jxp, float* jyp) { +- float m[3]; +- float4 ft = tex1Dfetch(tex_jacobian_cam, camera_pos); +- float4 r1 = tex1Dfetch(tex_jacobian_cam, camera_pos + 1); +- r[0] = r1.x; +- r[1] = r1.y; +- r[2] = r1.z; +- r[3] = r1.w; +- float4 r2 = tex1Dfetch(tex_jacobian_cam, camera_pos + 2); +- r[4] = r2.x; +- r[5] = r2.y; +- r[6] = r2.z; +- r[7] = r2.w; +- float4 r3 = tex1Dfetch(tex_jacobian_cam, camera_pos + 3); +- r[8] = r3.x; +- +- float4 temp = tex1Dfetch(tex_jacobian_pts, pt_pos); +- m[0] = temp.x; +- m[1] = temp.y; +- m[2] = temp.z; +- +- float x0 = r[0] * m[0] + r[1] * m[1] + r[2] * m[2]; +- float y0 = r[3] * m[0] + r[4] * m[1] + r[5] * m[2]; +- float z0 = r[6] * m[0] + r[7] * m[1] + r[8] * m[2]; +- float f_p2 = FDIV(ft.x, z0 + ft.w); +- float p0_p2 = FDIV(x0 + ft.y, z0 + ft.w); +- float p1_p2 = FDIV(y0 + ft.z, z0 + ft.w); +- +- if (pd) { +- float rr1 = r3.y * p0_p2 * p0_p2; +- float rr2 = r3.y * p1_p2 * p1_p2; +- float f_p2_x = f_p2 * (1.0 + 3.0 * rr1 + rr2); +- float f_p2_y = f_p2 * (1.0 + 3.0 * rr2 + rr1); +- /////////////////////////////////// +- jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2); +- } else { +- /////////////////////////////////// +- jxp[0] = f_p2 * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2 * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2 * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2 * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2 * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2 * (r[5] - r[8] * p1_p2); +- } +-} +- +-template +-__global__ void multiply_jx_noj_kernel(int num, int bwidth, int offset, +- float jic, float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y 
* bwidth; +- if (index >= num) return; +- +- __shared__ float data[9 * 64]; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float jxc[8], jyc[8], jxp[3], jyp[3]; +- jacobian_internal(proj.x << 1, proj.y, index, data + 9 * threadIdx.x, +- jic, jxc, jyc, jxp, jyp); +- +- ///////////////////////////////////// +- result[index] = make_float2( +- jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w + +- jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w + +- jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z, +- jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w + +- jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w + +- jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z); +-} +- +-template +-__global__ void multiply_jcx_noj_kernel(int num, int bwidth, float jic, +- float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- __shared__ float data[9 * 64]; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float4 xc1 = tex1Dfetch(tex_mjx_x, proj.x); +- float4 xc2 = tex1Dfetch(tex_mjx_x, proj.x + 1); +- +- //////////////////////////////////////////// +- float jxc[8], jyc[8]; +- jacobian_camera_internal(proj.x << 1, proj.y, index, +- data + 9 * threadIdx.x, jic, jxc, jyc); +- +- ///////////////////////////////////// +- result[index] = make_float2( +- jxc[0] * xc1.x + jxc[1] * xc1.y + jxc[2] * xc1.z + jxc[3] * xc1.w + +- jxc[4] * xc2.x + jxc[5] * xc2.y + jxc[6] * xc2.z + jxc[7] * xc2.w, +- jyc[0] * xc1.x + jyc[1] * xc1.y + jyc[2] * xc1.z + jyc[3] * xc1.w + +- jyc[4] * xc2.x + jyc[5] * xc2.y + jyc[6] * xc2.z + jyc[7] * xc2.w); +-} +- +-template +-__global__ void multiply_jpx_noj_kernel(int num, int bwidth, int offset, +- float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- +- __shared__ float data[9 * 64]; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float4 xp = tex1Dfetch(tex_mjx_x, proj.y + offset); +- +- //////////////////////////////////////////// +- float jxp[3], jyp[3]; +- jacobian_point_internal(proj.x << 1, proj.y, index, +- data + 9 * threadIdx.x, jxp, jyp); +- +- ///////////////////////////////////// +- result[index] = make_float2(jxp[0] * xp.x + jxp[1] * xp.y + jxp[2] * xp.z, +- jyp[0] * xp.x + jyp[1] * xp.y + jyp[2] * xp.z); +-} +- +-void ProgramCU::ComputeJX_(CuTexImage& x, CuTexImage& jx, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, +- CuTexImage& pjmap, bool intrinsic_fixed, +- int radial_distortion, int mode) { +- unsigned int nproj = pjmap.GetImgWidth(); +- unsigned int len = nproj; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- int point_offset = camera.GetImgWidth() * 2; +- float jfc = intrinsic_fixed ? 
0 : 1.0f; +- +- ///////////////////////////// +- pjmap.BindTexture(tex_mjx_idx); +- x.BindTexture(tex_mjx_x); +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- +- /////////////////////////////////// +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- if (mode == 0) { +- if (radial_distortion == -1) { +- meas.BindTexture(tex_jacobian_meas); +- multiply_jx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, jfc, (float2*)jx.data()); +- } else if (radial_distortion) { +- multiply_jx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, jfc, (float2*)jx.data()); +- } else { +- multiply_jx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, jfc, (float2*)jx.data()); +- } +- +- CheckErrorCUDA("ComputeJX_"); +- } else if (mode == 1) { +- if (radial_distortion == -1) { +- meas.BindTexture(tex_jacobian_meas); +- multiply_jcx_noj_kernel<<>>( +- len, (bw * bsize), jfc, (float2*)jx.data()); +- } else if (radial_distortion) { +- multiply_jcx_noj_kernel<<>>( +- len, (bw * bsize), jfc, (float2*)jx.data()); +- } else { +- multiply_jcx_noj_kernel<<>>( +- len, (bw * bsize), jfc, (float2*)jx.data()); +- } +- +- CheckErrorCUDA("ComputeJCX_"); +- } else if (mode == 2) { +- if (radial_distortion == 1) { +- multiply_jpx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, (float2*)jx.data()); +- } else { +- multiply_jpx_noj_kernel<<>>( +- len, (bw * bsize), point_offset, (float2*)jx.data()); +- } +- +- CheckErrorCUDA("ComputeJX_"); +- } +-} +- +-template +-__global__ void jte_cam_vec_noj_kernel(int num, int rowsz, float jic, +- float* jte) { +- __shared__ float value[KH * 32 * 9]; // 8 * KH * 32 +- int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz; +- if (cam >= num) return; +- +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jte_cmp, cam); // first camera +- int idx2 = tex1Dfetch(tex_jte_cmp, cam + 1); // last camera + 1 +- +- float* valuec = value + 32 * 9 * threadIdx.y; +- float* rp = valuec + threadIdx.x * 9; +- float rr[8], jxc[8], jyc[8]; +- for (int i = 0; i < 8; ++i) rr[i] = 0; +- +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- int index = tex1Dfetch(tex_jte_cmt, i); +- int2 proj = tex1Dfetch(tex_jacobian_idx, index); +- jacobian_camera_internal(cam << 2, proj.y, index, rp, jic, jxc, +- jyc); +- float2 vv = tex1Dfetch(tex_jte_pe, index); +- // +- for (int j = 0; j < 8; ++j) rr[j] += (jxc[j] * vv.x + jyc[j] * vv.y); +- } +- +- float* valuei = valuec + 8 * threadIdx.x; +- for (int i = 0; i < 8; ++i) valuei[i] = rr[i]; +- valuec[threadIdx.x] = (valuec[threadIdx.x] + valuec[threadIdx.x + 32] + +- valuec[threadIdx.x + 64] + valuec[threadIdx.x + 96] + +- valuec[threadIdx.x + 128] + valuec[threadIdx.x + 160] + +- valuec[threadIdx.x + 192] + valuec[threadIdx.x + 224]); +- if (threadIdx.x < 16) valuec[threadIdx.x] += valuec[threadIdx.x + 16]; +- if (threadIdx.x < 8) +- valuec[threadIdx.x] = valuec[threadIdx.x] + valuec[threadIdx.x + 8]; +- +- //////////////////////////////////// +- if (threadIdx.x < 8) jte[(cam << 3) + threadIdx.x] = valuec[threadIdx.x]; +-} +- +-template +-__global__ void jte_point_vec_noj_kernel(int num, int rowsz, float* jte) { +- //////////////////////////// +- __shared__ float value[KH * (9 * 32)]; +- int index = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz; +- if (index >= num) return; +- +- int idx1 = tex1Dfetch(tex_jte_pmp, index); // first +- int idx2 = tex1Dfetch(tex_jte_pmp, index + 1); // last + 1 +- float rx = 0, ry = 0, rz = 0, jxp[3], jyp[3]; +- int rowp = threadIdx.y * 9 * 32; +- float* rp = value + threadIdx.x * 9 + rowp; +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- float2 ev = tex1Dfetch(tex_jte_pe, i); +- int2 proj = tex1Dfetch(tex_jacobian_idx, i); +- jacobian_point_internal(proj.x << 1, proj.y, i, rp, jxp, jyp); +- rx += (jxp[0] * ev.x + jyp[0] * ev.y); +- ry += (jxp[1] * ev.x + jyp[1] * ev.y); +- rz += (jxp[2] * ev.x + jyp[2] * ev.y); +- } +- +- int loc = (threadIdx.x << 2) + rowp; +- value[loc] = rx; +- value[loc + 1] = ry; +- value[loc + 2] = rz; +- value[loc + 3] = 0; +- +- int ridx = threadIdx.x + rowp; +- value[ridx] = ((value[ridx] + value[ridx + 32]) + +- (value[ridx + 64] + value[ridx + 96])); +- if (threadIdx.x < 16) value[ridx] += value[ridx + 16]; +- if (threadIdx.x < 8) value[ridx] += value[ridx + 8]; +- if (threadIdx.x < 4) +- jte[(index << 2) + threadIdx.x] = value[ridx] + value[ridx + 4]; +-} +- +-void ProgramCU::ComputeJtE_(CuTexImage& e, CuTexImage& jte, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, +- CuTexImage& cmap, CuTexImage& cmlist, +- CuTexImage& pmap, CuTexImage& pjmap, CuTexImage& jp, +- bool intrinsic_fixed, int radial_distortion, +- int mode) { +- pjmap.BindTexture(tex_jacobian_idx); +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- if (radial_distortion) meas.BindTexture(tex_jacobian_meas); +- +- cmap.BindTexture(tex_jte_cmp); +- cmlist.BindTexture(tex_jte_cmt); +- e.BindTexture(tex_jte_pe); +- +- // +- unsigned int bw, bh; +- float jfc = intrinsic_fixed ? 
0 : 1.0f; +- int ncam = camera.GetImgWidth(); +- const int bheight1 = 2, bsize = 32; +- int nblock1 = (ncam + bheight1 - 1) / bheight1; +- GetBlockConfiguration(nblock1, bw, bh); +- dim3 grid(bw, bh), block(bsize, bheight1); +- if (mode == 2) { +- } else if (radial_distortion == -1) +- jte_cam_vec_noj_kernel<<>>( +- ncam, bw * bheight1, jfc, jte.data()); +- else if (radial_distortion) +- jte_cam_vec_noj_kernel<<>>( +- ncam, bw * bheight1, jfc, jte.data()); +- else +- jte_cam_vec_noj_kernel<<>>( +- ncam, bw * bheight1, jfc, jte.data()); +- CheckErrorCUDA("ComputeJtE_"); +- +- int npt = point.GetImgWidth(); +- unsigned int offsetv = 8 * ncam; +- const int bheight2 = 2, bsize2 = 32; +- int nblock2 = (npt + bheight2 - 1) / bheight2; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2, bheight2); +- if (mode == 1) { +- } else if (jp.IsValid()) { +- pmap.BindTexture(tex_jte_pmp); +- e.BindTexture(tex_jte_pex); +- jp.BindTexture2(tex_jte_jp, tex_jte_jp2); +- if (jp.GetDataSize() > MAX_TEXSIZE) +- jte_point_vec_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- else +- jte_point_vec_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- } else { +- pmap.BindTexture(tex_jte_pmp); +- if (radial_distortion && radial_distortion != -1) +- jte_point_vec_noj_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- else +- jte_point_vec_noj_kernel<<>>( +- npt, bw * bheight2, jte.data() + offsetv); +- } +- CheckErrorCUDA("ComputeJtE_"); +-} +- +-template +-__global__ void jtjd_cam_block_noj_kernel(int num, int rowsz, float lambda1, +- float lambda2, float jic, float* diag, +- float* blocks, +- bool add_existing_diagc) { +- const int VN = (md || pd) ? 8 : 7; +- __shared__ float buffer_all[32 * 9 * KH]; +- __shared__ float value_all[64 * KH]; +- +- // 8thread per camera +- int bcam = blockIdx.x * KH + blockIdx.y * rowsz; +- +- int cam = bcam + threadIdx.y; +- if (cam >= num) return; +- +- float* buffer = buffer_all + threadIdx.y * (32 * 9); +- float* value = value_all + threadIdx.y * 64; +- +- float jxc[8], jyc[8]; +- float* rp = buffer + threadIdx.x * 9; +- float row0[VN], row1[VN - 1], row2[VN - 2], row3[VN - 3]; +- float row4[VN - 4], row5[VN - 5], row6[VN - 6], row7[1] = {0}; +- // read data range for this camera +- // 8 thread will do the same thing +- int idx1 = tex1Dfetch(tex_jtjd_cmp, cam); // first camera +- int idx2 = tex1Dfetch(tex_jtjd_cmp, cam + 1); // last camera + 1 +- +-#define REPEAT7(FUNC) \ +- FUNC(0); \ +- FUNC(1); \ +- FUNC(2); \ +- FUNC(3); \ +- FUNC(4); \ +- FUNC(5); \ +- FUNC(6); +-#define SETZERO(k) \ +- for (int j = 0; j < VN - k; ++j) row##k[j] = 0; +- REPEAT7(SETZERO); +- +- float4 sjv[2]; +- if (scaling && (pd || md)) { +- sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1)); +- sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1); +- } +- +- // loop to read the index of the projection. 
+- // so to get the location to read the jacobian +- for (int i = idx1 + threadIdx.x; i < idx2; i += 32) { +- ///////////////////////////////////////// +- int index = tex1Dfetch(tex_jtjd_cmlist, i); +- int2 proj = tex1Dfetch(tex_jacobian_idx, index); +- +- /////////////////////////////////////////////// +- jacobian_camera_internal(cam << 2, proj.y, index, rp, jic, jxc, +- jyc); +- +- if (scaling && (pd || md)) { +- float* sj = (float*)sjv; // 32 threads...64 values +- for (int j = 0; j < VN; ++j) { +- jxc[j] *= sj[j]; +- jyc[j] *= sj[j]; +- } +- } +- +-//////////////////////////////////////////////// +-#define ADDROW(k) \ +- for (int j = k; j < VN; ++j) \ +- row##k[j - k] += (jxc[k] * jxc[j] + jyc[k] * jyc[j]) +- +- /////////////// +- REPEAT7(ADDROW); +- if (VN == 8) { +- ADDROW(7); +- } +- } +- +-//////////////////////////////////// +-// make the matrix..//add up the 32 * 8 matrix +-#define JTJDSUM8_V1() \ +- buffer[threadIdx.x] = \ +- (buffer[threadIdx.x] + buffer[threadIdx.x + 32] + \ +- buffer[threadIdx.x + 64] + buffer[threadIdx.x + 96] + \ +- buffer[threadIdx.x + 128] + buffer[threadIdx.x + 160] + \ +- buffer[threadIdx.x + 192] + buffer[threadIdx.x + 224]); +- +-#define JTJDSUM8_V2() \ +- buffer[threadIdx.x] = \ +- (((buffer[threadIdx.x] + buffer[threadIdx.x + 128]) + \ +- (buffer[threadIdx.x + 64] + buffer[threadIdx.x + 192])) + \ +- ((buffer[threadIdx.x + 32] + buffer[threadIdx.x + 160]) + \ +- (buffer[threadIdx.x + 96] + buffer[threadIdx.x + 224]))); +- +-#define STORE_ROWS(k) \ +- for (int i = 0; i < (VN - k); ++i) bufi[i] = row##k[i]; \ +- JTJDSUM8_V2(); \ +- if (threadIdx.x < 16 - k) buffer[threadIdx.x] += buffer[threadIdx.x + 16]; \ +- if (threadIdx.x < 8 - k) \ +- value[threadIdx.x + k * 9] = buffer[threadIdx.x] + buffer[threadIdx.x + 8]; +- +- float* bufi = buffer + threadIdx.x * 8; +- REPEAT7(STORE_ROWS); +- if (VN == 8) { +- STORE_ROWS(7); +- } +- +- ///////////////////////////////////////////////////////////////////////////////////////////// +- +- //////////////////////////////// (8 * i + j) -> (8 * j + i) +- //#define COPYSYM(i) if(threadIdx.x < VN - i - 1) value[threadIdx.x * 8 + i * +- //9 + 8] = value[threadIdx.x + i * 9 + 1]; +- if (threadIdx.x < VN - 1) value[threadIdx.x * 8 + 8] = value[threadIdx.x + 1]; +- if (threadIdx.x < VN - 2) +- value[threadIdx.x * 8 + 17] = value[threadIdx.x + 10]; +- if (threadIdx.x < VN - 3) +- value[threadIdx.x * 8 + 26] = value[threadIdx.x + 19]; +- if (threadIdx.x < VN - 4) +- value[threadIdx.x * 8 + 35] = value[threadIdx.x + 28]; +- if (threadIdx.x < VN - 5) +- value[threadIdx.x * 8 + 44] = value[threadIdx.x + 37]; +- if (threadIdx.x < VN - 6) +- value[threadIdx.x * 8 + 53] = value[threadIdx.x + 46]; +- if (VN == 8 && threadIdx.x < VN - 7) +- value[threadIdx.x * 8 + 62] = value[threadIdx.x + 55]; +- +- if (scaling && !pd && !md) { +- float4 sjv[2]; +- float* sj = (float*)sjv; // 32 threads...64 values +- sjv[0] = tex1Dfetch(tex_jacobian_sj, (cam << 1)); +- sjv[1] = tex1Dfetch(tex_jacobian_sj, (cam << 1) + 1); +- float sji = sj[threadIdx.x & 0x07]; +- value[threadIdx.x] *= (sji * sj[threadIdx.x / 8]); +- value[threadIdx.x + 32] *= (sji * sj[4 + threadIdx.x / 8]); +- } +- +- bool zero = ((threadIdx.x & 0x7) == VN); +- +- ///////////write back +- if (threadIdx.x < 8) { +- float* dp = value + threadIdx.x * 9; +- float temp = zero ? 
0 : dp[0]; +- int didx = threadIdx.x + (cam << 3); +- if (add_existing_diagc) temp += diag[didx]; +- diag[didx] = temp; +- dp[0] = lambda1 + lambda2 * temp; +- } +- int wpos = cam * (8 * VN) + threadIdx.x; +- blocks[wpos] = zero ? 0 : value[threadIdx.x]; +- if (threadIdx.x < VN * 8 - 32) +- blocks[wpos + 32] = zero ? 0 : value[threadIdx.x + 32]; +-} +- +-template +-__global__ void jtjd_point_block_noj_kernel(int num, int rowsz, float lambda1, +- float lambda2, float4* diag, +- float4* blocks, int ptx) { +- //////////////////////////// +- int index = blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * rowsz; +- if (index >= num) return; +- +- __shared__ float value[KW * 9]; +- int idx1 = tex1Dfetch(tex_jtjd_pmp, index); // first +- int idx2 = tex1Dfetch(tex_jtjd_pmp, index + 1); // last + 1 +- +- float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0; +- float jxp[3], jyp[3]; +- float* rp = value + threadIdx.x * 9; +- +- float4 sj; +- if (scaling && pd) sj = tex1Dfetch(tex_jacobian_sj, index + ptx); +- +- for (int i = idx1; i < idx2; ++i) { +- int2 proj = tex1Dfetch(tex_jacobian_idx, i); +- jacobian_point_internal(proj.x << 1, proj.y, i, rp, jxp, jyp); +- +- if (scaling && pd) { +- jxp[0] *= sj.x; +- jxp[1] *= sj.y; +- jxp[2] *= sj.z; +- jyp[0] *= sj.x; +- jyp[1] *= sj.y; +- jyp[2] *= sj.z; +- } +- M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]); +- M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]); +- M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]); +- M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]); +- M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]); +- M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]); +- } +- +- if (scaling && !pd) { +- sj = tex1Dfetch(tex_jacobian_sj, index + ptx); +- M00 *= (sj.x * sj.x); +- M01 *= (sj.x * sj.y); +- M02 *= (sj.x * sj.z); +- M11 *= (sj.y * sj.y); +- M12 *= (sj.y * sj.z); +- M22 *= (sj.z * sj.z); +- } +- +- diag[index] = make_float4(M00, M11, M22, 0); +- +- M00 = lambda2 * M00 + lambda1; +- M11 = lambda2 * M11 + lambda1; +- M22 = lambda2 * M22 + lambda1; +- +- // invert the 3x3 matrix. +- float det = (M00 * M11 - M01 * M01) * M22 + 2.0 * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 1] = make_float4(0, 0, 0, 0); +- blocks[write_pos + 2] = make_float4(0, 0, 0, 0); +- } else { +- float m00 = (M11 * M22 - M12 * M12) / det; +- float m01 = -(M01 * M22 - M12 * M02) / det; +- float m02 = (M01 * M12 - M02 * M11) / det; +- int write_pos = index * 3; +- blocks[write_pos] = make_float4(m00, m01, m02, 0); +- +- float m11 = (M00 * M22 - M02 * M02) / det; +- float m12 = -(M00 * M12 - M01 * M02) / det; +- blocks[write_pos + 1] = make_float4(m01, m11, m12, 0); +- +- float m22 = (M00 * M11 - M01 * M01) / det; +- blocks[write_pos + 2] = make_float4(m02, m12, m22, 0); +- } +-} +- +-void ProgramCU::ComputeDiagonalBlock_( +- float lambda, bool dampd, CuTexImage& camera, CuTexImage& point, +- CuTexImage& meas, CuTexImage& cmap, CuTexImage& cmlist, CuTexImage& pmap, +- CuTexImage& jmap, CuTexImage& jp, CuTexImage& sj, CuTexImage& diag, +- CuTexImage& blocks, bool intrinsic_fixed, int radial_distortion, +- bool add_existing_diagc, int mode) { +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? (1.0f + lambda) : 1.0f; +- float jfc = intrinsic_fixed ? 
0.0f : 1.0f; +- +- ////////////////////////////////// +- jmap.BindTexture(tex_jacobian_idx); +- camera.BindTexture(tex_jacobian_cam); +- point.BindTexture(tex_jacobian_pts); +- cmap.BindTexture(tex_jtjd_cmp); +- cmlist.BindTexture(tex_jtjd_cmlist); +- +- //////////////////////////////////////////////////// +- const unsigned int bsize1 = 32; +- const unsigned int bheight1 = 2; +- unsigned int ncam = camera.GetImgWidth(); // how many cameras +- unsigned int nblock = (ncam + bheight1 - 1) / bheight1; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 block1(bsize1, bheight1), grid1(bw, bh); +- +- /////////////////////////////////////////////////// +- if (radial_distortion == -1) meas.BindTexture(tex_jacobian_meas); +- if (mode == 2) { +- // skip the camera part. +- } else if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- if (radial_distortion == -1) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else if (radial_distortion) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- } else { +- if (radial_distortion == -1) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else if (radial_distortion) +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- else +- jtjd_cam_block_noj_kernel<<>>( +- ncam, bw * bheight1, lambda1, lambda2, jfc, diag.data(), +- blocks.data(), add_existing_diagc); +- } +- CheckErrorCUDA("ComputeDiagonalBlock_"); +- +- //////////////////////////////////////////////////// +- const unsigned int bsize2 = 64; +- unsigned int npoint = point.GetImgWidth(); +- unsigned int len2 = npoint; +- unsigned int nblock2 = (len2 + bsize2 - 1) / bsize2; +- unsigned int offsetd = 2 * ncam; +- unsigned int offsetb = (radial_distortion ? 
16 : 14) * ncam; +- GetBlockConfiguration(nblock2, bw, bh); +- dim3 grid2(bw, bh), block2(bsize2); +- pmap.BindTexture(tex_jtjd_pmp); +- +- if (mode == 1) { +- } else if (jp.IsValid()) { +- jp.BindTexture2(tex_jtjd_jp, tex_jtjd_jp2); +- if (jp.GetDataSize() > MAX_TEXSIZE) +- jtjd_point_block_kernel<2><<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb); +- else +- jtjd_point_block_kernel<1><<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, ((float4*)blocks.data()) + offsetb); +- } else { +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- if (radial_distortion && radial_distortion != -1) +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, offsetd); +- else +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, offsetd); +- } else { +- if (radial_distortion && radial_distortion != -1) +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, 0); +- else +- jtjd_point_block_noj_kernel<<>>( +- len2, (bw * bsize2), lambda1, lambda2, +- ((float4*)diag.data()) + offsetd, +- ((float4*)blocks.data()) + offsetb, 0); +- } +- } +- CheckErrorCUDA("ComputeDiagonalBlock_"); +- +- //////////////////////////////////////////////////// +- if (mode != 2) { +- const unsigned int bsize3 = JTJD_BLOCK_CAM_INVERT_KWIDTH; +- unsigned int len3 = ncam * 8; +- unsigned int nblock3 = (len3 + bsize3 - 1) / bsize3; +- dim3 grid3(nblock3), block3(bsize3); +- if (radial_distortion) +- jtjd_cam_block_invert_kernel<8><<>>( +- len3, (float4*)blocks.data()); +- else +- jtjd_cam_block_invert_kernel<7><<>>( +- len3, (float4*)blocks.data()); +- CheckErrorCUDA("ComputeDiagonalBlockInverse"); +- } +-} +- +-__global__ void projection_q_kernel(int nproj, int rowsz, float2* pj) { +- //////////////////////////////// +- int tidx = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * rowsz; +- if (tidx >= nproj) return; +- int2 proj = tex1Dfetch(tex_projection_idx, tidx); +- float2 wq = tex1Dfetch(tex_projection_mea, tidx); +- /////////////////////////////////// +- float f1 = tex1Dfetch(tex_projection_cam, proj.x * 4).x; +- float r1 = tex1Dfetch(tex_projection_cam, proj.x * 4 + 3).w; +- float f2 = tex1Dfetch(tex_projection_cam, proj.y * 4).x; +- float r2 = tex1Dfetch(tex_projection_cam, proj.y * 4 + 3).w; +- pj[tidx] = make_float2(-wq.x * (f1 - f2), -wq.y * (r1 - r2)); +-} +- +-void ProgramCU::ComputeProjectionQ(CuTexImage& camera, CuTexImage& qmap, +- CuTexImage& qw, CuTexImage& proj, +- int offset) { +- /////////////////////////////////////// +- unsigned int len = qmap.GetImgWidth(); +- unsigned int bsize = PROJECTION_FRT_KWIDTH; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- /////////////////////////////////////////// +- camera.BindTexture(tex_projection_cam); +- qmap.BindTexture(tex_projection_idx); +- qw.BindTexture(tex_projection_mea); +- +- ////////////////////////////// +- projection_q_kernel<<>>(len, bw * bsize, +- ((float2*)proj.data()) + offset); +-} +- +-template +-__global__ void multiply_jqx_kernel(int num, int bwidth, float2* result) { +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; 
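// The linear work-item index computed just above follows the launch convention used
// throughout this file: with bwidth == gridDim.x * blockDim.x,
//     index = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y),
// i.e. the y-dimension of the grid simply continues the numbering of the x-dimension.
// GetBlockConfiguration(nblock, bw, bh) presumably splits an oversized 1-D block count
// into such a (bw, bh) grid to stay under the per-dimension grid limit of older CUDA
// hardware; a minimal host-side sketch consistent with how (bw, bh) is consumed here
// (an assumption -- the actual implementation may differ) would be:
//     void GetBlockConfiguration(unsigned int nblock, unsigned int& bw, unsigned int& bh) {
//       const unsigned int kMaxGridDim = 65535;        // classic gridDim.x limit
//       bh = (nblock + kMaxGridDim - 1) / kMaxGridDim; // rows of blocks
//       bw = (nblock + bh - 1) / bh;                   // blocks per row, bw * bh >= nblock
//     }
// Work items past the logical end are discarded by the bounds check below.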
+- if (index >= num) return; +- //////////////////////////////////////////// +- int2 proj = tex1Dfetch(tex_mjx_idx, index); +- float2 wq = tex1Dfetch(tex_jacobian_meas, index); +- int idx1 = proj.x * 2, idx2 = proj.y * 2; +- float x11 = tex1Dfetch(tex_mjx_x, idx1).x; +- float x17 = tex1Dfetch(tex_mjx_x, idx1 + 1).w; +- float x21 = tex1Dfetch(tex_mjx_x, idx2).x; +- float x27 = tex1Dfetch(tex_mjx_x, idx2 + 1).w; +- +- if (SJ) { +- float s11 = tex1Dfetch(tex_jacobian_sj, idx1).x; +- float s17 = tex1Dfetch(tex_jacobian_sj, idx1 + 1).w; +- float s21 = tex1Dfetch(tex_jacobian_sj, idx2).x; +- float s27 = tex1Dfetch(tex_jacobian_sj, idx2 + 1).w; +- result[index] = make_float2((x11 * s11 - x21 * s21) * wq.x, +- (x17 * s17 - x27 * s27) * wq.y); +- } else { +- result[index] = make_float2((x11 - x21) * wq.x, (x17 - x27) * wq.y); +- } +-} +- +-void ProgramCU::ComputeJQX(CuTexImage& x, CuTexImage& qmap, CuTexImage& wq, +- CuTexImage& sj, CuTexImage& jx, int offset) { +- unsigned int nproj = qmap.GetImgWidth(); +- unsigned int len = nproj; +- unsigned int bsize = 64; +- unsigned int nblock = (len + bsize - 1) / bsize; +- unsigned int bw, bh; +- +- ///////////////////////////// +- qmap.BindTexture(tex_mjx_idx); +- x.BindTexture(tex_mjx_x); +- wq.BindTexture(tex_jacobian_meas); +- +- /////////////////////////////////// +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- multiply_jqx_kernel<<>>(len, (bw * bsize), +- ((float2*)jx.data()) + offset); +- } else { +- multiply_jqx_kernel<<>>(len, (bw * bsize), +- ((float2*)jx.data()) + offset); +- } +-} +- +-texture tex_jte_q_idx; +-texture tex_jte_q_w; +- +-template +-__global__ void jte_cam_q_kernel(int num, int bwidth, float* jte) { +- // int cam = blockIdx.x * KH + threadIdx.y + blockIdx.y * rowsz ; +- int index = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * bwidth; +- if (index >= num) return; +- int2 indexp = tex1Dfetch(tex_jte_q_idx, index); +- if (indexp.x == -1) return; +- float2 wq = tex1Dfetch(tex_jte_q_w, index); +- float2 e1 = tex1Dfetch(tex_jte_pe, indexp.x); +- float2 e2 = tex1Dfetch(tex_jte_pe, indexp.y); +- int index8 = index << 3; +- if (SJ) { +- float s1 = tex1Dfetch(tex_jacobian_sj, index * 2).x; +- jte[index8] += s1 * wq.x * (e1.x - e2.x); +- float s7 = tex1Dfetch(tex_jacobian_sj, index * 2 + 1).w; +- jte[index8 + 7] += s7 * wq.y * (e1.y - e2.y); +- } else { +- jte[index8] += wq.x * (e1.x - e2.x); +- jte[index8 + 7] += wq.y * (e1.y - e2.y); +- } +-} +- +-void ProgramCU::ComputeJQtEC(CuTexImage& pe, CuTexImage& qlist, CuTexImage& wq, +- CuTexImage& sj, CuTexImage& jte) { +- int ncam = qlist.GetImgWidth(); +- const int bsize = 32; +- int nblock = (ncam + bsize - 1) / bsize; +- unsigned int bw, bh; +- GetBlockConfiguration(nblock, bw, bh); +- dim3 grid(bw, bh), block(bsize); +- +- pe.BindTexture(tex_jte_pe); +- qlist.BindTexture(tex_jte_q_idx); +- wq.BindTexture(tex_jte_q_w); +- +- if (sj.IsValid()) { +- sj.BindTexture(tex_jacobian_sj); +- jte_cam_q_kernel<<>>(ncam, (bw * bsize), jte.data()); +- } else { +- jte_cam_q_kernel<<>>(ncam, (bw * bsize), jte.data()); +- } +-} +- +-} // namespace pba +diff --git a/lib/PBA/ProgramCU.h b/lib/PBA/ProgramCU.h +deleted file mode 100644 +index d3d8af609..000000000 +--- a/lib/PBA/ProgramCU.h ++++ /dev/null +@@ -1,127 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: ProgramCU.h +-// Author: Changchang Wu +-// Description : interface for the ProgramCU classes. 
+-// It is basically a wrapper around all the CUDA kernels +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef _PROGRAM_CU_H +-#define _PROGRAM_CU_H +- +-class CuTexImage; +- +-namespace pba { +-namespace ProgramCU { +- +-int SetCudaDevice(int device); +-size_t GetCudaMemoryCap(); +-int CheckErrorCUDA(const char* location); +-void FinishWorkCUDA(); +-void ClearPreviousError(); +-void ResetCurrentDevice(); +-void GetBlockConfiguration(unsigned int nblock, unsigned int& bw, +- unsigned int& bh); +- +-////////////////////////////////////////////////////////// +-void ComputeSQRT(CuTexImage& tex); +-void ComputeRSQRT(CuTexImage& tex); +-void ComputeVXY(CuTexImage& texX, CuTexImage& texY, CuTexImage& result, +- unsigned int part = 0, unsigned int skip = 0); +-void ComputeSAXPY(float a, CuTexImage& texX, CuTexImage& texY, +- CuTexImage& result); +-void ComputeSAX(float a, CuTexImage& texX, CuTexImage& result); +-void ComputeSXYPZ(float a, CuTexImage& texX, CuTexImage& texY, CuTexImage& texZ, +- CuTexImage& result); +-float ComputeVectorMax(CuTexImage& vector, CuTexImage& buf); +-float ComputeVectorSum(CuTexImage& vector, CuTexImage& buf, int skip); +-double ComputeVectorNorm(CuTexImage& vector, CuTexImage& buf); +-double ComputeVectorNormW(CuTexImage& vector, CuTexImage& weight, +- CuTexImage& buf); +-double ComputeVectorDot(CuTexImage& vector1, CuTexImage& vector2, +- CuTexImage& buf); +- +-////////////////////////////////////////////////////////////////////////// +-void UncompressCamera(int ncam, CuTexImage& camera0, CuTexImage& result); +-void CompressCamera(int ncam, CuTexImage& camera0, CuTexImage& result); +-void UpdateCameraPoint(int ncam, CuTexImage& camera, CuTexImage& point, +- CuTexImage& delta, CuTexImage& new_camera, +- CuTexImage& new_point, int mode = 0); +- +-///////////////////////////////////////////////////////////////////////// +-void ComputeJacobian(CuTexImage& camera, CuTexImage& point, CuTexImage& jc, +- CuTexImage& jp, CuTexImage& proj_map, CuTexImage& sj, +- CuTexImage& meas, CuTexImage& cmlist, bool intrinsic_fixed, +- int radial_distortion, bool shuffle); +-void ComputeProjection(CuTexImage& camera, CuTexImage& point, CuTexImage& meas, +- CuTexImage& proj_map, CuTexImage& proj, int radial); +-void ComputeProjectionX(CuTexImage& camera, CuTexImage& point, CuTexImage& meas, +- CuTexImage& proj_map, CuTexImage& proj, int radial); +- +-bool ShuffleCameraJacobian(CuTexImage& jc, CuTexImage& map, CuTexImage& result); +- +-///////////////////////////////////////////////////////////// +-void ComputeDiagonal(CuTexImage& jc, CuTexImage& cmap, CuTexImage& jp, +- CuTexImage& pmap, CuTexImage& cmlist, CuTexImage& jtjd, +- CuTexImage& jtjdi, bool jc_transpose, int radial, +- bool add_existing_diagc); +-void MultiplyBlockConditioner(int ncam, int npoint, CuTexImage& blocks, +- CuTexImage& vector, CuTexImage& 
result, +- int radial, int mode = 0); +- +-//////////////////////////////////////////////////////////////////////////////// +-void ComputeProjectionQ(CuTexImage& camera, CuTexImage& qmap, CuTexImage& qw, +- CuTexImage& proj, int offset); +-void ComputeJQX(CuTexImage& x, CuTexImage& qmap, CuTexImage& wq, CuTexImage& sj, +- CuTexImage& jx, int offset); +-void ComputeJQtEC(CuTexImage& pe, CuTexImage& qlist, CuTexImage& wq, +- CuTexImage& sj, CuTexImage& result); +-void ComputeDiagonalQ(CuTexImage& qlistw, CuTexImage& sj, CuTexImage& diag); +- +-////////////////////////////////////////////////////////////////////////// +-void ComputeJX(int point_offset, CuTexImage& x, CuTexImage& jc, CuTexImage& jp, +- CuTexImage& jmap, CuTexImage& result, int mode = 0); +-void ComputeJtE(CuTexImage& pe, CuTexImage& jc, CuTexImage& cmap, +- CuTexImage& cmlist, CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& jte, bool jc_transpose, int mode = 0); +-void ComputeDiagonalBlock(float lambda, bool dampd, CuTexImage& jc, +- CuTexImage& cmap, CuTexImage& jp, CuTexImage& pmap, +- CuTexImage& cmlist, CuTexImage& diag, +- CuTexImage& blocks, int radial_distortion, +- bool jc_transpose, bool add_existing_diagc, +- int mode = 0); +- +-///////////////////////////////////////////////////////////////////// +-void ComputeJX_(CuTexImage& x, CuTexImage& jx, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, CuTexImage& pjmap, +- bool intrinsic_fixed, int radial_distortion, int mode = 0); +-void ComputeJtE_(CuTexImage& e, CuTexImage& jte, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, CuTexImage& cmap, +- CuTexImage& cmlist, CuTexImage& pmap, CuTexImage& jmap, +- CuTexImage& jp, bool intrinsic_fixed, int radial_distortion, +- int mode = 0); +-void ComputeDiagonalBlock_(float lambda, bool dampd, CuTexImage& camera, +- CuTexImage& point, CuTexImage& meas, +- CuTexImage& cmap, CuTexImage& cmlist, +- CuTexImage& pmap, CuTexImage& jmap, CuTexImage& jp, +- CuTexImage& sj, CuTexImage& diag, CuTexImage& blocks, +- bool intrinsic_fixed, int radial_distortion, +- bool add_existing_diagc, int mode = 0); +- +-} // namespace ProgramCU +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/SparseBundleCPU.cpp b/lib/PBA/SparseBundleCPU.cpp +deleted file mode 100644 +index b03708209..000000000 +--- a/lib/PBA/SparseBundleCPU.cpp ++++ /dev/null +@@ -1,4369 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCPU.cpp +-// Author: Changchang Wu +-// Description : implementation of the CPU-based multicore bundle adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-using std::vector; +-using std::cout; +-using std::pair; +-using std::ofstream; +-using std::max; +- +-#include +-#include +-#include +-#include "pba.h" +-#include "SparseBundleCPU.h" +- +-#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP +-#include +-#endif +- +-//#define POINT_DATA_ALIGN4 +-#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) +-#undef CPUPBA_USE_SSE +-#undef CPUPBA_USE_AVX +-#undef POINT_DATA_ALIGN4 +-#if defined(_M_ARM) && _M_ARM >= 7 && !defined(DISABLE_CPU_NEON) +-#include +-#define CPUPBA_USE_NEON +-#elif defined(__ARM_NEON) && !defined(DISABLE_CPU_NEON) +-#include +-#define CPUPBA_USE_NEON +-#endif +-#elif defined(__AVX__) && !defined(DISABLE_CPU_AVX) +-#include +-#define CPUPBA_USE_AVX +-#undef CPUPBA_USE_SSE +-#undef POINT_DATA_ALIGN4 +-#elif (defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86) && _M_IX86_FP >= 2)) && !defined(DISABLE_CPU_SSE) +-#define CPUPBA_USE_SSE +-#include +-#include +-#endif +- +-#ifdef POINT_DATA_ALIGN4 +-#define POINT_ALIGN 4 +-#else +-#define POINT_ALIGN 3 +-#endif +- +-#define POINT_ALIGN2 (POINT_ALIGN * 2) +- +-#ifdef _WIN32 +-#define NOMINMAX +-#include +-#define INLINESUFIX +-#define finite _finite +-#else +-#include +-#include +-#include +-#endif +- +-// maximum thread count +-#define THREAD_NUM_MAX 64 +-// compute the number of threads for vector operatoins, pure heuristics... +-#define AUTO_MT_NUM(sz) \ +- int((log((double)sz) / log(2.0) - 18.5) * __num_cpu_cores / 16.0) +- +-namespace pba { +- +-template +-void avec::SaveToFile(const char* name) { +- ofstream out(name); +- for (Float* p = _data; p < _last; ++p) out << (*p) << '\n'; +-} +- +-#ifdef CPUPBA_USE_SSE +-#define CPUPBA_USE_SIMD +-namespace MYSSE { +-template +-class SSE {}; +-template <> +-class SSE { +- public: +- typedef __m128 sse_type; +- static inline sse_type zero() { return _mm_setzero_ps(); } +-}; +-template <> +-class SSE { +- public: +- typedef __m128d sse_type; +- static inline sse_type zero() { return _mm_setzero_pd(); } +-}; +- +-//////////////////////////////////////////// +-template +-inline size_t sse_step() { +- return 16 / sizeof(Float); +-}; +-inline __m128 sse_load1(const float* p) { return _mm_load1_ps(p); } +-inline __m128 sse_load(const float* p) { return _mm_load_ps(p); } +-inline __m128 sse_add(__m128 s1, __m128 s2) { return _mm_add_ps(s1, s2); } +-inline __m128 sse_sub(__m128 s1, __m128 s2) { return _mm_sub_ps(s1, s2); } +-inline __m128 sse_mul(__m128 s1, __m128 s2) { return _mm_mul_ps(s1, s2); } +-inline __m128 sse_sqrt(__m128 s) { return _mm_sqrt_ps(s); } +- +-inline __m128d sse_load1(const double* p) { return _mm_load1_pd(p); } +-inline __m128d sse_load(const double* p) { return _mm_load_pd(p); } +-inline __m128d sse_add(__m128d s1, __m128d s2) { return _mm_add_pd(s1, s2); } +-inline __m128d sse_sub(__m128d s1, __m128d s2) { return _mm_sub_pd(s1, s2); } +-inline __m128d sse_mul(__m128d s1, __m128d s2) { return _mm_mul_pd(s1, s2); } +-inline __m128d sse_sqrt(__m128d s) { return _mm_sqrt_pd(s); } +- +-#ifdef _WIN32 +-inline float sse_sum(__m128 s) { +- return (s.m128_f32[0] + s.m128_f32[2]) + (s.m128_f32[1] + s.m128_f32[3]); +-} +-inline double sse_sum(__m128d s) { return s.m128d_f64[0] + s.m128d_f64[1]; } +-#else +-inline float sse_sum(__m128 s) { +- float* f = (float*)(&s); +- return (f[0] + f[2]) + (f[1] + f[3]); +-} 
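// The manual reduction above adds the four lanes in a balanced order, (f[0]+f[2])+(f[1]+f[3]).
// An equivalent horizontal sum can also be written with SSE shuffles alone; the helper below
// is only an illustrative alternative (sse_sum_shuffle is not part of the original code) and
// uses the same <xmmintrin.h>/<emmintrin.h> intrinsics as the rest of this SSE path.
inline float sse_sum_shuffle(__m128 s) {
  __m128 hi = _mm_movehl_ps(s, s);                  // (f2, f3, f2, f3)
  __m128 pair = _mm_add_ps(s, hi);                  // lane0 = f0+f2, lane1 = f1+f3
  __m128 lane1 = _mm_shuffle_ps(pair, pair, 0x55);  // broadcast lane 1
  return _mm_cvtss_f32(_mm_add_ss(pair, lane1));    // (f0+f2) + (f1+f3)
}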
+-inline double sse_sum(__m128d s) { +- double* d = (double*)(&s); +- return d[0] + d[1]; +-} +-#endif +-// inline float sse_dot(__m128 s1, __m128 s2) {__m128 temp = _mm_dp_ps(s1, +-// s2, 0xF1); float* f = (float*) (&temp); return f[0]; } +-// inline double sse_dot(__m128d s1, __m128d s2) {__m128d temp = +-// _mm_dp_pd(s1, s2, 0x31); double* f = (double*) (&temp); return f[0] ; } +-inline void sse_store(float* p, __m128 s) { _mm_store_ps(p, s); } +-inline void sse_store(double* p, __m128d s) { _mm_store_pd(p, s); } +- +-inline void data_prefetch(const void* p) { +- _mm_prefetch((const char*)p, _MM_HINT_NTA); +-} +-}; +- +-namespace ProgramCPU { +-using namespace MYSSE; +-#define SSE_ZERO SSE::zero() +-#define SSE_T typename SSE::sse_type +-///////////////////////////// +-inline void ScaleJ4(float* jcx, float* jcy, const float* sj) { +- __m128 ps = _mm_load_ps(sj); +- _mm_store_ps(jcx, _mm_mul_ps(_mm_load_ps(jcx), ps)); +- _mm_store_ps(jcy, _mm_mul_ps(_mm_load_ps(jcy), ps)); +-} +-inline void ScaleJ8(float* jcx, float* jcy, const float* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +-inline void ScaleJ4(double* jcx, double* jcy, const double* sj) { +- __m128d ps1 = _mm_load_pd(sj), ps2 = _mm_load_pd(sj + 2); +- _mm_store_pd(jcx, _mm_mul_pd(_mm_load_pd(jcx), ps1)); +- _mm_store_pd(jcy, _mm_mul_pd(_mm_load_pd(jcy), ps1)); +- _mm_store_pd(jcx + 2, _mm_mul_pd(_mm_load_pd(jcx + 2), ps2)); +- _mm_store_pd(jcy + 2, _mm_mul_pd(_mm_load_pd(jcy + 2), ps2)); +-} +-inline void ScaleJ8(double* jcx, double* jcy, const double* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +-inline float DotProduct8(const float* v1, const float* v2) { +- __m128 ds = _mm_add_ps(_mm_mul_ps(_mm_load_ps(v1), _mm_load_ps(v2)), +- _mm_mul_ps(_mm_load_ps(v1 + 4), _mm_load_ps(v2 + 4))); +- return sse_sum(ds); +-} +-inline double DotProduct8(const double* v1, const double* v2) { +- __m128d d1 = _mm_mul_pd(_mm_load_pd(v1), _mm_load_pd(v2)); +- __m128d d2 = _mm_mul_pd(_mm_load_pd(v1 + 2), _mm_load_pd(v2 + 2)); +- __m128d d3 = _mm_mul_pd(_mm_load_pd(v1 + 4), _mm_load_pd(v2 + 4)); +- __m128d d4 = _mm_mul_pd(_mm_load_pd(v1 + 6), _mm_load_pd(v2 + 6)); +- __m128d ds = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4)); +- return sse_sum(ds); +-} +- +-inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc, +- const float* xp, float* jx) { +-#ifdef POINT_DATA_ALIGN4 +- __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4), +- mxp = _mm_load_ps(xp); +- __m128 ds1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc), xc1), +- _mm_mul_ps(_mm_load_ps(jc + 4), xc2)); +- __m128 dx1 = _mm_add_ps(ds1, _mm_mul_ps(_mm_load_ps(jp), mxp)); +- jx[0] = sse_sum(dx1); +- __m128 ds2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(jc + 8), xc1), +- _mm_mul_ps(_mm_load_ps(jc + 12), xc2)); +- __m128 dx2 = _mm_add_ps(ds2, _mm_mul_ps(_mm_load_ps(jp + 4), mxp)); +- jx[1] = sse_sum(dx2); +-#else +- __m128 xc1 = _mm_load_ps(xc), xc2 = _mm_load_ps(xc + 4); +- __m128 jc1 = _mm_load_ps(jc), jc2 = _mm_load_ps(jc + 4); +- __m128 jc3 = _mm_load_ps(jc + 8), jc4 = _mm_load_ps(jc + 12); +- __m128 ds1 = _mm_add_ps(_mm_mul_ps(jc1, xc1), _mm_mul_ps(jc2, xc2)); +- __m128 ds2 = _mm_add_ps(_mm_mul_ps(jc3, xc1), _mm_mul_ps(jc4, xc2)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-/*jx[0] = (sse_dot(jc1, xc1) + sse_dot(jc2, xc2)) + (jp[0] * xp[0] + jp[1] * +-xp[1] + jp[2] * xp[2]); 
+-jx[1] = (sse_dot(jc3, xc1) + sse_dot(jc4, xc2)) + (jp[POINT_ALIGN] * xp[0] + +-jp[POINT_ALIGN+1] * xp[1] + jp[POINT_ALIGN+2] * xp[2]);*/ +-#endif +-} +- +-inline void ComputeTwoJX(const double* jc, const double* jp, const double* xc, +- const double* xp, double* jx) { +- __m128d xc1 = _mm_load_pd(xc), xc2 = _mm_load_pd(xc + 2), +- xc3 = _mm_load_pd(xc + 4), xc4 = _mm_load_pd(xc + 6); +- __m128d d1 = _mm_mul_pd(_mm_load_pd(jc), xc1); +- __m128d d2 = _mm_mul_pd(_mm_load_pd(jc + 2), xc2); +- __m128d d3 = _mm_mul_pd(_mm_load_pd(jc + 4), xc3); +- __m128d d4 = _mm_mul_pd(_mm_load_pd(jc + 6), xc4); +- __m128d ds1 = _mm_add_pd(_mm_add_pd(d1, d2), _mm_add_pd(d3, d4)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- +- __m128d d5 = _mm_mul_pd(_mm_load_pd(jc + 8), xc1); +- __m128d d6 = _mm_mul_pd(_mm_load_pd(jc + 10), xc2); +- __m128d d7 = _mm_mul_pd(_mm_load_pd(jc + 12), xc3); +- __m128d d8 = _mm_mul_pd(_mm_load_pd(jc + 14), xc4); +- __m128d ds2 = _mm_add_pd(_mm_add_pd(d5, d6), _mm_add_pd(d7, d8)); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-} +- +-// v += ax +-inline void AddScaledVec8(float a, const float* x, float* v) { +- __m128 aa = sse_load1(&a); +- _mm_store_ps(v, _mm_add_ps(_mm_mul_ps(_mm_load_ps(x), aa), _mm_load_ps(v))); +- _mm_store_ps(v + 4, _mm_add_ps(_mm_mul_ps(_mm_load_ps(x + 4), aa), +- _mm_load_ps(v + 4))); +-} +-// v += ax +-inline void AddScaledVec8(double a, const double* x, double* v) { +- __m128d aa = sse_load1(&a); +- _mm_store_pd(v, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x), aa), _mm_load_pd(v))); +- _mm_store_pd(v + 2, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 2), aa), +- _mm_load_pd(v + 2))); +- _mm_store_pd(v + 4, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 4), aa), +- _mm_load_pd(v + 4))); +- _mm_store_pd(v + 6, _mm_add_pd(_mm_mul_pd(_mm_load_pd(x + 6), aa), +- _mm_load_pd(v + 6))); +-} +- +-inline void AddBlockJtJ(const float* jc, float* block, int vn) { +- __m128 j1 = _mm_load_ps(jc); +- __m128 j2 = _mm_load_ps(jc + 4); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m128 a = sse_load1(jc); +- _mm_store_ps(block + 0, +- _mm_add_ps(_mm_mul_ps(a, j1), _mm_load_ps(block + 0))); +- _mm_store_ps(block + 4, +- _mm_add_ps(_mm_mul_ps(a, j2), _mm_load_ps(block + 4))); +- } +-} +- +-inline void AddBlockJtJ(const double* jc, double* block, int vn) { +- __m128d j1 = _mm_load_pd(jc); +- __m128d j2 = _mm_load_pd(jc + 2); +- __m128d j3 = _mm_load_pd(jc + 4); +- __m128d j4 = _mm_load_pd(jc + 6); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m128d a = sse_load1(jc); +- _mm_store_pd(block + 0, +- _mm_add_pd(_mm_mul_pd(a, j1), _mm_load_pd(block + 0))); +- _mm_store_pd(block + 2, +- _mm_add_pd(_mm_mul_pd(a, j2), _mm_load_pd(block + 2))); +- _mm_store_pd(block + 4, +- _mm_add_pd(_mm_mul_pd(a, j3), _mm_load_pd(block + 4))); +- _mm_store_pd(block + 6, +- _mm_add_pd(_mm_mul_pd(a, j4), _mm_load_pd(block + 6))); +- } +-} +-}; +-#endif +- +-#ifdef CPUPBA_USE_AVX +-#define CPUPBA_USE_SIMD +-namespace MYAVX { +-template +-class SSE {}; +-template <> +-class SSE { +- public: +- typedef __m256 sse_type; // static size_t step() {return 4;} +- static inline sse_type zero() { return _mm256_setzero_ps(); } +-}; +-template <> +-class SSE { +- public: +- typedef __m256d sse_type; // static size_t step() {return 2;} +- static inline sse_type zero() { return _mm256_setzero_pd(); } +-}; +- +-//////////////////////////////////////////// +-template +-inline size_t sse_step() { +- return 32 / 
sizeof(Float); +-}; +-inline __m256 sse_load1(const float* p) { return _mm256_broadcast_ss(p); } +-inline __m256 sse_load(const float* p) { return _mm256_load_ps(p); } +-inline __m256 sse_add(__m256 s1, __m256 s2) { return _mm256_add_ps(s1, s2); } +-inline __m256 sse_sub(__m256 s1, __m256 s2) { return _mm256_sub_ps(s1, s2); } +-inline __m256 sse_mul(__m256 s1, __m256 s2) { return _mm256_mul_ps(s1, s2); } +-inline __m256 sse_sqrt(__m256 s) { return _mm256_sqrt_ps(s); } +- +-// inline __m256 sse_fmad(__m256 a, __m256 b, __m256 c) {return +-// _mm256_fmadd_ps(a, b, c);} +- +-inline __m256d sse_load1(const double* p) { return _mm256_broadcast_sd(p); } +-inline __m256d sse_load(const double* p) { return _mm256_load_pd(p); } +-inline __m256d sse_add(__m256d s1, __m256d s2) { return _mm256_add_pd(s1, s2); } +-inline __m256d sse_sub(__m256d s1, __m256d s2) { return _mm256_sub_pd(s1, s2); } +-inline __m256d sse_mul(__m256d s1, __m256d s2) { return _mm256_mul_pd(s1, s2); } +-inline __m256d sse_sqrt(__m256d s) { return _mm256_sqrt_pd(s); } +- +-#ifdef _WIN32 +-inline float sse_sum(__m256 s) { +- return ((s.m256_f32[0] + s.m256_f32[4]) + (s.m256_f32[2] + s.m256_f32[6])) + +- ((s.m256_f32[1] + s.m256_f32[5]) + (s.m256_f32[3] + s.m256_f32[7])); +-} +-inline double sse_sum(__m256d s) { +- return (s.m256d_f64[0] + s.m256d_f64[2]) + (s.m256d_f64[1] + s.m256d_f64[3]); +-} +-#else +-inline float sse_sum(__m256 s) { +- float* f = (float*)(&s); +- return ((f[0] + f[4]) + (f[2] + f[6])) + ((f[1] + f[5]) + (f[3] + f[7])); +-} +-inline double sse_sum(__m256d s) { +- double* d = (double*)(&s); +- return (d[0] + d[2]) + (d[1] + d[3]); +-} +-#endif +-inline float sse_dot(__m256 s1, __m256 s2) { +- __m256 temp = _mm256_dp_ps(s1, s2, 0xf1); +- float* f = (float*)(&temp); +- return f[0] + f[4]; +-} +-inline double sse_dot(__m256d s1, __m256d s2) { +- return sse_sum(sse_mul(s1, s2)); +-} +- +-inline void sse_store(float* p, __m256 s) { _mm256_store_ps(p, s); } +-inline void sse_store(double* p, __m256d s) { _mm256_store_pd(p, s); } +- +-inline void data_prefetch(const void* p) { +- _mm_prefetch((const char*)p, _MM_HINT_NTA); +-} +-}; +- +-namespace ProgramCPU { +-using namespace MYAVX; +-#define SSE_ZERO SSE::zero() +-#define SSE_T typename SSE::sse_type +- +-///////////////////////////// +-inline void ScaleJ8(float* jcx, float* jcy, const float* sj) { +- __m256 ps = _mm256_load_ps(sj); +- _mm256_store_ps(jcx, _mm256_mul_ps(_mm256_load_ps(jcx), ps)); +- _mm256_store_ps(jcy, _mm256_mul_ps(_mm256_load_ps(jcy), ps)); +-} +-inline void ScaleJ4(double* jcx, double* jcy, const double* sj) { +- __m256d ps = _mm256_load_pd(sj); +- _mm256_store_pd(jcx, _mm256_mul_pd(_mm256_load_pd(jcx), ps)); +- _mm256_store_pd(jcy, _mm256_mul_pd(_mm256_load_pd(jcy), ps)); +-} +-inline void ScaleJ8(double* jcx, double* jcy, const double* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +-inline float DotProduct8(const float* v1, const float* v2) { +- return sse_dot(_mm256_load_ps(v1), _mm256_load_ps(v2)); +-} +-inline double DotProduct8(const double* v1, const double* v2) { +- __m256d ds = _mm256_add_pd( +- _mm256_mul_pd(_mm256_load_pd(v1), _mm256_load_pd(v2)), +- _mm256_mul_pd(_mm256_load_pd(v1 + 4), _mm256_load_pd(v2 + 4))); +- return sse_sum(ds); +-} +- +-inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc, +- const float* xp, float* jx) { +- __m256 xcm = _mm256_load_ps(xc), jc1 = _mm256_load_ps(jc), +- jc2 = _mm256_load_ps(jc + 8); +- jx[0] = sse_dot(jc1, xcm) + (jp[0] * xp[0] + jp[1] * xp[1] + 
jp[2] * xp[2]); +- jx[1] = sse_dot(jc2, xcm) + +- (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-} +- +-inline void ComputeTwoJX(const double* jc, const double* jp, const double* xc, +- const double* xp, double* jx) { +- __m256d xc1 = _mm256_load_pd(xc), xc2 = _mm256_load_pd(xc + 4); +- __m256d jc1 = _mm256_load_pd(jc), jc2 = _mm256_load_pd(jc + 4); +- __m256d jc3 = _mm256_load_pd(jc + 8), jc4 = _mm256_load_pd(jc + 12); +- __m256d ds1 = _mm256_add_pd(_mm256_mul_pd(jc1, xc1), _mm256_mul_pd(jc2, xc2)); +- __m256d ds2 = _mm256_add_pd(_mm256_mul_pd(jc3, xc1), _mm256_mul_pd(jc4, xc2)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-} +- +-// v += ax +-inline void AddScaledVec8(float a, const float* x, float* v) { +- __m256 aa = sse_load1(&a); +- _mm256_store_ps(v, _mm256_add_ps(_mm256_mul_ps(_mm256_load_ps(x), aa), +- _mm256_load_ps(v))); +- //_mm256_store_ps(v, _mm256_fmadd_ps(_mm256_load_ps(x), aa, +- //_mm256_load_ps(v))); +-} +-// v += ax +-inline void AddScaledVec8(double a, const double* x, double* v) { +- __m256d aa = sse_load1(&a); +- _mm256_store_pd(v, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x), aa), +- _mm256_load_pd(v))); +- _mm256_store_pd(v + 4, _mm256_add_pd(_mm256_mul_pd(_mm256_load_pd(x + 4), aa), +- _mm256_load_pd(v + 4))); +-} +- +-inline void AddBlockJtJ(const float* jc, float* block, int vn) { +- __m256 j = _mm256_load_ps(jc); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m256 a = sse_load1(jc); +- _mm256_store_ps(block, +- _mm256_add_ps(_mm256_mul_ps(a, j), _mm256_load_ps(block))); +- } +-} +- +-inline void AddBlockJtJ(const double* jc, double* block, int vn) { +- __m256d j1 = _mm256_load_pd(jc); +- __m256d j2 = _mm256_load_pd(jc + 4); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- __m256d a = sse_load1(jc); +- _mm256_store_pd(block + 0, _mm256_add_pd(_mm256_mul_pd(a, j1), +- _mm256_load_pd(block + 0))); +- _mm256_store_pd(block + 4, _mm256_add_pd(_mm256_mul_pd(a, j2), +- _mm256_load_pd(block + 4))); +- } +-} +-}; +- +-#endif +- +-#ifdef CPUPBA_USE_NEON +-#define CPUPBA_USE_SIMD +-#define SIMD_NO_SQRT +-#define SIMD_NO_DOUBLE +-namespace MYNEON { +-template +-class SSE {}; +-template <> +-class SSE { +- public: +- typedef float32x4_t sse_type; +-}; +- +-//////////////////////////////////////////// +-template +-inline size_t sse_step() { +- return 16 / sizeof(Float); +-}; +-inline float32x4_t sse_load1(const float* p) { return vld1q_dup_f32(p); } +-inline float32x4_t sse_load(const float* p) { return vld1q_f32(p); } +-inline float32x4_t sse_loadzero() { +- float z = 0; +- return sse_load1(&z); +-} +-inline float32x4_t sse_add(float32x4_t s1, float32x4_t s2) { +- return vaddq_f32(s1, s2); +-} +-inline float32x4_t sse_sub(float32x4_t s1, float32x4_t s2) { +- return vsubq_f32(s1, s2); +-} +-inline float32x4_t sse_mul(float32x4_t s1, float32x4_t s2) { +- return vmulq_f32(s1, s2); +-} +-// inline float32x4_t sse_sqrt(float32x4_t s) {return +-// _mm_sqrt_ps(s); } +-inline float sse_sum(float32x4_t s) { +- float* f = (float*)(&s); +- return (f[0] + f[2]) + (f[1] + f[3]); +-} +-inline void sse_store(float* p, float32x4_t s) { vst1q_f32(p, s); } +-inline void data_prefetch(const void* p) {} +-}; +-namespace ProgramCPU { +-using namespace MYNEON; +-#define SSE_ZERO sse_loadzero() +-#define SSE_T typename SSE::sse_type +-///////////////////////////// +-inline void 
ScaleJ4(float* jcx, float* jcy, const float* sj) { +- float32x4_t ps = sse_load(sj); +- sse_store(jcx, sse_mul(sse_load(jcx), ps)); +- sse_store(jcy, sse_mul(sse_load(jcy), ps)); +-} +-inline void ScaleJ8(float* jcx, float* jcy, const float* sj) { +- ScaleJ4(jcx, jcy, sj); +- ScaleJ4(jcx + 4, jcy + 4, sj + 4); +-} +- +-inline float DotProduct8(const float* v1, const float* v2) { +- float32x4_t ds = sse_add(sse_mul(sse_load(v1), sse_load(v2)), +- sse_mul(sse_load(v1 + 4), sse_load(v2 + 4))); +- return sse_sum(ds); +-} +- +-inline void ComputeTwoJX(const float* jc, const float* jp, const float* xc, +- const float* xp, float* jx) { +-#ifdef POINT_DATA_ALIGN4 +- float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4), mxp = sse_load(xp); +- float32x4_t ds1 = +- sse_add(sse_mul(sse_load(jc), xc1), sse_mul(sse_load(jc + 4), xc2)); +- float32x4_t dx1 = sse_add(ds1, sse_mul(sse_load(jp), mxp)); +- jx[0] = sse_sum(dx1); +- float32x4_t ds2 = +- sse_add(sse_mul(sse_load(jc + 8), xc1), sse_mul(sse_load(jc + 12), xc2)); +- float32x4_t dx2 = sse_add(ds2, sse_mul(sse_load(jp + 4), mxp)); +- jx[1] = sse_sum(dx2); +-#else +- float32x4_t xc1 = sse_load(xc), xc2 = sse_load(xc + 4); +- float32x4_t jc1 = sse_load(jc), jc2 = sse_load(jc + 4); +- float32x4_t jc3 = sse_load(jc + 8), jc4 = sse_load(jc + 12); +- float32x4_t ds1 = sse_add(sse_mul(jc1, xc1), sse_mul(jc2, xc2)); +- float32x4_t ds2 = sse_add(sse_mul(jc3, xc1), sse_mul(jc4, xc2)); +- jx[0] = sse_sum(ds1) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- sse_sum(ds2) + (jp[POINT_ALIGN] * xp[0] + jp[POINT_ALIGN + 1] * xp[1] + +- jp[POINT_ALIGN + 2] * xp[2]); +-/*jx[0] = (sse_dot(jc1, xc1) + sse_dot(jc2, xc2)) + (jp[0] * xp[0] + jp[1] * +-xp[1] + jp[2] * xp[2]); +-jx[1] = (sse_dot(jc3, xc1) + sse_dot(jc4, xc2)) + (jp[POINT_ALIGN] * xp[0] + +-jp[POINT_ALIGN+1] * xp[1] + jp[POINT_ALIGN+2] * xp[2]);*/ +-#endif +-} +- +-// v += ax +-inline void AddScaledVec8(float a, const float* x, float* v) { +- float32x4_t aa = sse_load1(&a); +- sse_store(v, sse_add(sse_mul(sse_load(x), aa), sse_load(v))); +- sse_store(v + 4, sse_add(sse_mul(sse_load(x + 4), aa), sse_load(v + 4))); +-} +- +-inline void AddBlockJtJ(const float* jc, float* block, int vn) { +- float32x4_t j1 = sse_load(jc); +- float32x4_t j2 = sse_load(jc + 4); +- for (int i = 0; i < vn; ++i, ++jc, block += 8) { +- float32x4_t a = sse_load1(jc); +- sse_store(block + 0, sse_add(sse_mul(a, j1), sse_load(block + 0))); +- sse_store(block + 4, sse_add(sse_mul(a, j2), sse_load(block + 4))); +- } +-} +-}; +-#endif +- +-namespace ProgramCPU { +-int __num_cpu_cores = 0; +-template +-double ComputeVectorNorm(const avec& vec, int mt = 0); +- +-#if defined(CPUPBA_USE_SIMD) +-template +-void ComputeSQRT(avec& vec) { +-#ifndef SIMD_NO_SQRT +- const size_t step = sse_step(); +- Float *p = &vec[0], *pe = p + vec.size(), *pex = pe - step; +- for (; p <= pex; p += step) sse_store(p, sse_sqrt(sse_load(p))); +- for (; p < pe; ++p) p[0] = sqrt(p[0]); +-#else +- for (Float* it = vec.begin(); it < vec.end(); ++it) *it = sqrt(*it); +-#endif +-} +- +-template +-void ComputeRSQRT(avec& vec) { +- Float *p = &vec[0], *pe = p + vec.size(); +- for (; p < pe; ++p) p[0] = (p[0] == 0 ? 
0 : Float(1.0) / p[0]); +- ComputeSQRT(vec); +-} +- +-template +-void SetVectorZero(Float* p, Float* pe) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- Float* pex = pe - step; +- for (; p <= pex; p += step) sse_store(p, sse); +- for (; p < pe; ++p) *p = 0; +-} +- +-template +-void SetVectorZero(avec& vec) { +- Float *p = &vec[0], *pe = p + vec.size(); +- SetVectorZero(p, pe); +-} +- +-// function not used +-template +-inline void MemoryCopyA(const Float* p, const Float* pe, Float* d) { +- const size_t step = sse_step(); +- const Float* pex = pe - step; +- for (; p <= pex; p += step, d += step) sse_store(d, sse_load(p)); +- // while(p < pe) *d++ = *p++; +-} +- +-template +-void ComputeVectorNorm(const Float* p, const Float* pe, double* psum) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- const Float* pex = pe - step; +- for (; p <= pex; p += step) { +- SSE_T ps = sse_load(p); +- sse = sse_add(sse, sse_mul(ps, ps)); +- } +- double sum = sse_sum(sse); +- for (; p < pe; ++p) sum += p[0] * p[0]; +- *psum = sum; +-} +- +-template +-double ComputeVectorNormW(const avec& vec, const avec& weight) { +- if (weight.begin() != NULL) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- const Float *p = vec, *pe = p + vec.size(), *pex = pe - step; +- const Float* w = weight; +- for (; p <= pex; p += step, w += step) { +- SSE_T pw = sse_load(w), ps = sse_load(p); +- sse = sse_add(sse, sse_mul(sse_mul(ps, pw), ps)); +- } +- double sum = sse_sum(sse); +- for (; p < pe; ++p, ++w) sum += p[0] * w[0] * p[0]; +- return sum; +- } else { +- return ComputeVectorNorm(vec, 0); +- } +-} +- +-template +-double ComputeVectorDot(const avec& vec1, const avec& vec2) { +- SSE_T sse = SSE_ZERO; +- const size_t step = sse_step(); +- const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step; +- const Float* p2 = vec2; +- for (; p1 <= pex; p1 += step, p2 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse = sse_add(sse, sse_mul(ps1, ps2)); +- } +- double sum = sse_sum(sse); +- for (; p1 < pe; ++p1, ++p2) sum += p1[0] * p2[0]; +- return sum; +-} +- +-template +-void ComputeVXY(const avec& vec1, const avec& vec2, +- avec& result, size_t part = 0, size_t skip = 0) { +- const size_t step = sse_step(); +- const Float *p1 = vec1 + skip, *pe = p1 + (part ? 
part : vec1.size()), +- *pex = pe - step; +- const Float* p2 = vec2 + skip; +- Float* p3 = result + skip; +- for (; p1 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_mul(ps1, ps2)); +- } +- for (; p1 < pe; ++p1, ++p2, ++p3) *p3 = p1[0] * p2[0]; +-} +- +-template +-void ComputeSAXPY(Float a, const Float* p1, const Float* p2, Float* p3, +- Float* pe) { +- const size_t step = sse_step(); +- SSE_T aa = sse_load1(&a); +- Float* pex = pe - step; +- if (a == 1.0f) { +- for (; p3 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_add(ps2, ps1)); +- } +- } else if (a == -1.0f) { +- for (; p3 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_sub(ps2, ps1)); +- } +- } else { +- for (; p3 <= pex; p1 += step, p2 += step, p3 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2); +- sse_store(p3, sse_add(ps2, sse_mul(ps1, aa))); +- } +- } +- for (; p3 < pe; ++p1, ++p2, ++p3) p3[0] = a * p1[0] + p2[0]; +-} +- +-template +-void ComputeSAX(Float a, const avec& vec1, avec& result) { +- const size_t step = sse_step(); +- SSE_T aa = sse_load1(&a); +- const Float *p1 = vec1, *pe = p1 + vec1.size(), *pex = pe - step; +- Float* p3 = result; +- for (; p1 <= pex; p1 += step, p3 += step) { +- sse_store(p3, sse_mul(sse_load(p1), aa)); +- } +- for (; p1 < pe; ++p1, ++p3) p3[0] = a * p1[0]; +-} +- +-template +-inline void ComputeSXYPZ(Float a, const Float* p1, const Float* p2, +- const Float* p3, Float* p4, Float* pe) { +- const size_t step = sse_step(); +- SSE_T aa = sse_load1(&a); +- Float* pex = pe - step; +- for (; p4 <= pex; p1 += step, p2 += step, p3 += step, p4 += step) { +- SSE_T ps1 = sse_load(p1), ps2 = sse_load(p2), ps3 = sse_load(p3); +- sse_store(p4, sse_add(ps3, sse_mul(sse_mul(ps1, aa), ps2))); +- } +- for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) p4[0] = a * p1[0] * p2[0] + p3[0]; +-} +- +-#else +-template +-void ComputeSQRT(avec& vec) { +- Float* it = vec.begin(); +- for (; it < vec.end(); ++it) { +- *it = sqrt(*it); +- } +-} +-template +-void ComputeRSQRT(avec& vec) { +- Float* it = vec.begin(); +- for (; it < vec.end(); ++it) { +- *it = (*it == 0 ? 0 : Float(1.0) / sqrt(*it)); +- } +-} +-template +-inline void SetVectorZero(Float* p, Float* pe) { +- std::fill(p, pe, 0); +-} +-template +-inline void SetVectorZero(avec& vec) { +- std::fill(vec.begin(), vec.end(), 0); +-} +- +-template +-inline void MemoryCopyA(const Float* p, const Float* pe, Float* d) { +- while (p < pe) *d++ = *p++; +-} +- +-template +-double ComputeVectorNormW(const avec& vec, const avec& weight) { +- double sum = 0; +- const Float *it1 = vec.begin(), *it2 = weight.begin(); +- for (; it1 < vec.end(); ++it1, ++it2) { +- sum += (*it1) * (*it2) * (*it1); +- } +- return sum; +-} +- +-template +-double ComputeVectorDot(const avec& vec1, const avec& vec2) { +- double sum = 0; +- const Float *it1 = vec1.begin(), *it2 = vec2.begin(); +- for (; it1 < vec1.end(); ++it1, ++it2) { +- sum += (*it1) * (*it2); +- } +- return sum; +-} +-template +-void ComputeVectorNorm(const Float* p, const Float* pe, double* psum) { +- double sum = 0; +- for (; p < pe; ++p) sum += (*p) * (*p); +- *psum = sum; +-} +-template +-inline void ComputeVXY(const avec& vec1, const avec& vec2, +- avec& result, size_t part = 0, size_t skip = 0) { +- const Float *it1 = vec1.begin() + skip, *it2 = vec2.begin() + skip; +- const Float* ite = part ? 
(it1 + part) : vec1.end(); +- Float* it3 = result.begin() + skip; +- for (; it1 < ite; ++it1, ++it2, ++it3) { +- (*it3) = (*it1) * (*it2); +- } +-} +-template +-void ScaleJ8(Float* jcx, Float* jcy, const Float* sj) { +- for (int i = 0; i < 8; ++i) { +- jcx[i] *= sj[i]; +- jcy[i] *= sj[i]; +- } +-} +- +-template +-inline void AddScaledVec8(Float a, const Float* x, Float* v) { +- for (int i = 0; i < 8; ++i) v[i] += (a * x[i]); +-} +- +-template +-void ComputeSAX(Float a, const avec& vec1, avec& result) { +- const Float* it1 = vec1.begin(); +- Float* it3 = result.begin(); +- for (; it1 < vec1.end(); ++it1, ++it3) { +- (*it3) = (a * (*it1)); +- } +-} +- +-template +-inline void ComputeSXYPZ(Float a, const Float* p1, const Float* p2, +- const Float* p3, Float* p4, Float* pe) { +- for (; p4 < pe; ++p1, ++p2, ++p3, ++p4) *p4 = (a * (*p1) * (*p2) + (*p3)); +-} +- +-template +-void ComputeSAXPY(Float a, const Float* it1, const Float* it2, Float* it3, +- Float* ite) { +- if (a == (Float)1.0) { +- for (; it3 < ite; ++it1, ++it2, ++it3) { +- (*it3) = ((*it1) + (*it2)); +- } +- } else { +- for (; it3 < ite; ++it1, ++it2, ++it3) { +- (*it3) = (a * (*it1) + (*it2)); +- } +- } +-} +-template +-void AddBlockJtJ(const Float* jc, Float* block, int vn) { +- for (int i = 0; i < vn; ++i) { +- Float *row = block + i * 8, a = jc[i]; +- for (int j = 0; j < vn; ++j) row[j] += a * jc[j]; +- } +-} +-#endif +- +-#ifdef _WIN32 +-#define DEFINE_THREAD_DATA(X) \ +- template \ +- struct X##_STRUCT { +-#define DECLEAR_THREAD_DATA(X, ...) \ +- X##_STRUCT tdata = {__VA_ARGS__}; \ +- X##_STRUCT* newdata = new X##_STRUCT(tdata) +-#define BEGIN_THREAD_PROC(X) \ +- } \ +- ; \ +- template \ +- DWORD X##_PROC(X##_STRUCT* q) { +-#define END_THREAD_RPOC(X) \ +- delete q; \ +- return 0; \ +- } +- +-#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP +-#define MYTHREAD std::thread +-#define RUN_THREAD(X, t, ...) \ +- DECLEAR_THREAD_DATA(X, __VA_ARGS__); \ +- t = std::thread(X##_PROC, newdata) +-#define WAIT_THREAD(tv, n) \ +- { \ +- for (size_t i = 0; i < size_t(n); ++i) tv[i].join(); \ +- } +-#else +-#define MYTHREAD HANDLE +-#define RUN_THREAD(X, t, ...) \ +- DECLEAR_THREAD_DATA(X, __VA_ARGS__); \ +- t = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)X##_PROC, newdata, \ +- 0, 0) +-#define WAIT_THREAD(tv, n) \ +- { \ +- WaitForMultipleObjects((DWORD)n, tv, TRUE, INFINITE); \ +- for (size_t i = 0; i < size_t(n); ++i) CloseHandle(tv[i]); \ +- } +-#endif +-#else +-#define DEFINE_THREAD_DATA(X) \ +- template \ +- struct X##_STRUCT { \ +- int tid; +-#define DECLEAR_THREAD_DATA(X, ...) \ +- X##_STRUCT tdata = {i, __VA_ARGS__}; \ +- X##_STRUCT* newdata = new X##_STRUCT(tdata) +-#define BEGIN_THREAD_PROC(X) \ +- } \ +- ; \ +- template \ +- void* X##_PROC(X##_STRUCT* q) { +-// cpu_set_t mask; CPU_ZERO( &mask ); +-// CPU_SET( q->tid, &mask ); +-// if( sched_setaffinity(0, sizeof(mask), &mask +-// ) == -1 ) +-// std::cout <<"WARNING: Could not set CPU +-// Affinity, continuing...\n"; +-#define END_THREAD_RPOC(X) \ +- delete q; \ +- return 0; \ +- } \ +- template \ +- struct X##_FUNCTOR { \ +- typedef void* (*func_type)(X##_STRUCT*); \ +- static func_type get() { return &(X##_PROC); } \ +- }; +-#define MYTHREAD pthread_t +- +-#define RUN_THREAD(X, t, ...) 
\ +- DECLEAR_THREAD_DATA(X, __VA_ARGS__); \ +- pthread_create(&t, NULL, (void* (*)(void*))X##_FUNCTOR::get(), newdata) +-#define WAIT_THREAD(tv, n) \ +- { \ +- for (size_t i = 0; i < size_t(n); ++i) pthread_join(tv[i], NULL); \ +- } +-#endif +-template +-inline void MemoryCopyB(const Float* p, const Float* pe, Float* d) { +- while (p < pe) *d++ = *p++; +-} +- +-template +-inline Float DotProduct8(const Float* v1, const Float* v2) { +- return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2] + v1[3] * v2[3] + +- v1[4] * v2[4] + v1[5] * v2[5] + v1[6] * v2[6] + v1[7] * v2[7]; +-} +-template +-inline void ComputeTwoJX(const Float* jc, const Float* jp, const Float* xc, +- const Float* xp, Float* jx) { +- jx[0] = DotProduct8(jc, xc) + (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = +- DotProduct8(jc + 8, xc) + (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]); +-} +-template +-Float ComputeVectorMax(const avec& vec) { +- Float v = 0; +- const Float* it = vec.begin(); +- for (; it < vec.end(); ++it) { +- Float vi = (Float)fabs(*it); +- v = std::max(v, vi); +- } +- return v; +-} +- +-template +-void ComputeSXYPZ(Float a, const avec& vec1, const avec& vec2, +- const avec& vec3, avec& result) { +- if (vec1.begin() != NULL) { +- const Float *p1 = &vec1[0], *p2 = &vec2[0], *p3 = &vec3[0]; +- Float *p4 = &result[0], *pe = p4 + result.size(); +- ComputeSXYPZ(a, p1, p2, p3, p4, pe); +- +- } else { +- // ComputeSAXPY(a, vec2, vec3, result, 0); +- ComputeSAXPY(a, vec2.begin(), vec3.begin(), result.begin(), +- result.end()); +- } +-} +- +-DEFINE_THREAD_DATA(ComputeSAXPY) +-Float a; +-const Float *p1, *p2; +-Float *p3, *pe; +-BEGIN_THREAD_PROC(ComputeSAXPY) +-ComputeSAXPY(q->a, q->p1, q->p2, q->p3, q->pe); +-END_THREAD_RPOC(ComputeSAXPY) +- +-template +-void ComputeSAXPY(Float a, const avec& vec1, const avec& vec2, +- avec& result, int mt = 0) { +- const bool auto_multi_thread = true; +- if (auto_multi_thread && mt == 0) { +- mt = AUTO_MT_NUM(result.size() * 2); +- } +- if (mt > 1 && result.size() >= mt * 4) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- const Float *p1 = vec1.begin(), *p2 = vec2.begin(); +- Float* p3 = result.begin(); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = (result.size() * i / thread_num + FLOAT_ALIGN - 1) / +- FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last_ = (result.size() * (i + 1) / thread_num + FLOAT_ALIGN - 1) / +- FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last = std::min(last_, result.size()); +- RUN_THREAD(ComputeSAXPY, threads[i], a, p1 + first, p2 + first, +- p3 + first, p3 + last); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- ComputeSAXPY(a, vec1.begin(), vec2.begin(), result.begin(), result.end()); +- } +-} +- +-DEFINE_THREAD_DATA(ComputeVectorNorm) +-const Float *p, *pe; +-double* sum; +-BEGIN_THREAD_PROC(ComputeVectorNorm) +-ComputeVectorNorm(q->p, q->pe, q->sum); +-END_THREAD_RPOC(ComputeVectorNorm) +- +-template +-double ComputeVectorNorm(const avec& vec, int mt) { +- const bool auto_multi_thread = true; +- if (auto_multi_thread && mt == 0) { +- mt = AUTO_MT_NUM(vec.size()); +- } +- if (mt > 1 && vec.size() >= mt * 4) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- double sumv[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- const Float* p = vec; +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = (vec.size() * i / thread_num + FLOAT_ALIGN - 1) / +- FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last_ = (vec.size() * (i + 1) / thread_num + FLOAT_ALIGN - 1) / +- 
FLOAT_ALIGN * FLOAT_ALIGN; +- size_t last = std::min(last_, vec.size()); +- RUN_THREAD(ComputeVectorNorm, threads[i], p + first, p + last, sumv + i); +- } +- WAIT_THREAD(threads, thread_num); +- double sum = 0; +- for (size_t i = 0; i < thread_num; ++i) sum += sumv[i]; +- return sum; +- } else { +- double sum; +- ComputeVectorNorm(vec.begin(), vec.end(), &sum); +- return sum; +- } +-} +- +-template +-void GetRodriguesRotation(const Float m[3][3], Float r[3]) { +- // http://www.euclideanspace.com/maths/geometry/rotations/conversions/matrixToAngle/index.htm +- double a = (m[0][0] + m[1][1] + m[2][2] - 1.0) / 2.0; +- const double epsilon = 0.01; +- if (fabs(m[0][1] - m[1][0]) < epsilon && fabs(m[1][2] - m[2][1]) < epsilon && +- fabs(m[0][2] - m[2][0]) < epsilon) { +- if (fabs(m[0][1] + m[1][0]) < 0.1 && fabs(m[1][2] + m[2][1]) < 0.1 && +- fabs(m[0][2] + m[2][0]) < 0.1 && a > 0.9) { +- r[0] = 0; +- r[1] = 0; +- r[2] = 0; +- } else { +- const Float ha = Float(sqrt(0.5) * 3.14159265358979323846); +- double xx = (m[0][0] + 1.0) / 2.0; +- double yy = (m[1][1] + 1.0) / 2.0; +- double zz = (m[2][2] + 1.0) / 2.0; +- double xy = (m[0][1] + m[1][0]) / 4.0; +- double xz = (m[0][2] + m[2][0]) / 4.0; +- double yz = (m[1][2] + m[2][1]) / 4.0; +- +- if ((xx > yy) && (xx > zz)) { +- if (xx < epsilon) { +- r[0] = 0; +- r[1] = r[2] = ha; +- } else { +- double t = sqrt(xx); +- r[0] = Float(t * 3.14159265358979323846); +- r[1] = Float(xy / t * 3.14159265358979323846); +- r[2] = Float(xz / t * 3.14159265358979323846); +- } +- } else if (yy > zz) { +- if (yy < epsilon) { +- r[0] = r[2] = ha; +- r[1] = 0; +- } else { +- double t = sqrt(yy); +- r[0] = Float(xy / t * 3.14159265358979323846); +- r[1] = Float(t * 3.14159265358979323846); +- r[2] = Float(yz / t * 3.14159265358979323846); +- } +- } else { +- if (zz < epsilon) { +- r[0] = r[1] = ha; +- r[2] = 0; +- } else { +- double t = sqrt(zz); +- r[0] = Float(xz / t * 3.14159265358979323846); +- r[1] = Float(yz / t * 3.14159265358979323846); +- r[2] = Float(t * 3.14159265358979323846); +- } +- } +- } +- } else { +- a = acos(a); +- double b = 0.5 * a / sin(a); +- r[0] = Float(b * (m[2][1] - m[1][2])); +- r[1] = Float(b * (m[0][2] - m[2][0])); +- r[2] = Float(b * (m[1][0] - m[0][1])); +- } +-} +-template +-void UncompressRodriguesRotation(const Float r[3], Float m[]) { +- double a = sqrt(r[0] * r[0] + r[1] * r[1] + r[2] * r[2]); +- double ct = a == 0.0 ? 0.5f : (1.0f - cos(a)) / a / a; +- double st = a == 0.0 ? 
1 : sin(a) / a; +- m[0] = Float(1.0 - (r[1] * r[1] + r[2] * r[2]) * ct); +- m[1] = Float(r[0] * r[1] * ct - r[2] * st); +- m[2] = Float(r[2] * r[0] * ct + r[1] * st); +- m[3] = Float(r[0] * r[1] * ct + r[2] * st); +- m[4] = Float(1.0f - (r[2] * r[2] + r[0] * r[0]) * ct); +- m[5] = Float(r[1] * r[2] * ct - r[0] * st); +- m[6] = Float(r[2] * r[0] * ct - r[1] * st); +- m[7] = Float(r[1] * r[2] * ct + r[0] * st); +- m[8] = Float(1.0 - (r[0] * r[0] + r[1] * r[1]) * ct); +-} +-template +-void UpdateCamera(int ncam, const avec& camera, const avec& delta, +- avec& new_camera) { +- const Float *c = &camera[0], *d = &delta[0]; +- Float *nc = &new_camera[0], m[9]; +- // f[1], t[3], r[3][3], d[1] +- for (int i = 0; i < ncam; ++i, c += 16, d += 8, nc += 16) { +- nc[0] = max(c[0] + d[0], ((Float)1e-10)); +- nc[1] = c[1] + d[1]; +- nc[2] = c[2] + d[2]; +- nc[3] = c[3] + d[3]; +- nc[13] = c[13] + d[7]; +- +- //////////////////////////////////////////////////// +- UncompressRodriguesRotation(d + 4, m); +- nc[4] = m[0] * c[4 + 0] + m[1] * c[4 + 3] + m[2] * c[4 + 6]; +- nc[5] = m[0] * c[4 + 1] + m[1] * c[4 + 4] + m[2] * c[4 + 7]; +- nc[6] = m[0] * c[4 + 2] + m[1] * c[4 + 5] + m[2] * c[4 + 8]; +- nc[7] = m[3] * c[4 + 0] + m[4] * c[4 + 3] + m[5] * c[4 + 6]; +- nc[8] = m[3] * c[4 + 1] + m[4] * c[4 + 4] + m[5] * c[4 + 7]; +- nc[9] = m[3] * c[4 + 2] + m[4] * c[4 + 5] + m[5] * c[4 + 8]; +- nc[10] = m[6] * c[4 + 0] + m[7] * c[4 + 3] + m[8] * c[4 + 6]; +- nc[11] = m[6] * c[4 + 1] + m[7] * c[4 + 4] + m[8] * c[4 + 7]; +- nc[12] = m[6] * c[4 + 2] + m[7] * c[4 + 5] + m[8] * c[4 + 8]; +- +- // Float temp[3]; +- // GetRodriguesRotation((Float (*)[3]) (nc + 4), temp); +- // UncompressRodriguesRotation(temp, nc + 4); +- nc[14] = c[14]; +- nc[15] = c[15]; +- } +-} +- +-template +-void UpdateCameraPoint(int ncam, const avec& camera, +- const avec& point, avec& delta, +- avec& new_camera, avec& new_point, +- int mode, int mt) { +- //////////////////////////// +- if (mode != 2) { +- UpdateCamera(ncam, camera, delta, new_camera); +- } +- ///////////////////////////// +- if (mode != 1) { +- avec dp; +- dp.set(delta.begin() + 8 * ncam, point.size()); +- ComputeSAXPY((Float)1.0, dp, point, new_point, mt); +- } +-} +- +-template +-void ComputeProjection(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt); +- +-DEFINE_THREAD_DATA(ComputeProjection) +-size_t nproj; +-const Float *camera, *point, *ms; +-const int* jmap; +-Float* pj; +-int radial_distortion; +-BEGIN_THREAD_PROC(ComputeProjection) +-ComputeProjection(q->nproj, q->camera, q->point, q->ms, q->jmap, q->pj, +- q->radial_distortion, 0); +-END_THREAD_RPOC(ComputeProjection) +- +-template +-void ComputeProjection(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeProjection, threads[i], last - first, camera, point, +- ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial); +- } +- WAIT_THREAD(threads, thread_num); +- +- } else { +- for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) { +- const Float* c = camera + jmap[0] * 16; +- const Float* m = point + jmap[1] * POINT_ALIGN; +- 
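// In the scalar loop below, each camera is packed as c = (f, t[3], R[3][3] row-major, k, ...)
// (cf. the "f[1], t[3], r[3][3], d[1]" layout noted in UpdateCamera above), so
//     (p0, p1, p2)^T = R * M + t,   with f = c[0] and distortion parameter k = c[13],
// and the residual written to pj is
//     radial ==  1 :  pj = ms - f * (1 + k * (p0^2 + p1^2) / p2^2) * (p0, p1) / p2
//     radial == -1 :  pj = ms * (1 + k * (ms_x^2 + ms_y^2)) - f * (p0, p1) / p2
//     otherwise    :  pj = ms - f * (p0, p1) / p2
// i.e. distortion is applied to the projection for radial == 1 and to the measurement
// for radial == -1.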
///////////////////////////////////////////////////// +- Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1]; +- Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2]; +- Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3]; +- +- if (radial == 1) { +- Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2); +- Float f_p2 = c[0] * rr / p2; +- pj[0] = ms[0] - p0 * f_p2; +- pj[1] = ms[1] - p1 * f_p2; +- } else if (radial == -1) { +- Float f_p2 = c[0] / p2; +- Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]); +- pj[0] = ms[0] * rd - p0 * f_p2; +- pj[1] = ms[1] * rd - p1 * f_p2; +- } else { +- pj[0] = ms[0] - p0 * c[0] / p2; +- pj[1] = ms[1] - p1 * c[0] / p2; +- } +- } +- } +-} +- +-template +-void ComputeProjectionX(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt); +- +-DEFINE_THREAD_DATA(ComputeProjectionX) +-size_t nproj; +-const Float *camera, *point, *ms; +-const int* jmap; +-Float* pj; +-int radial_distortion; +-BEGIN_THREAD_PROC(ComputeProjectionX) +-ComputeProjectionX(q->nproj, q->camera, q->point, q->ms, q->jmap, q->pj, +- q->radial_distortion, 0); +-END_THREAD_RPOC(ComputeProjectionX) +- +-template +-void ComputeProjectionX(size_t nproj, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, Float* pj, int radial, +- int mt) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeProjectionX, threads[i], last - first, camera, point, +- ms + 2 * first, jmap + 2 * first, pj + 2 * first, radial); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, pj += 2) { +- const Float* c = camera + jmap[0] * 16; +- const Float* m = point + jmap[1] * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- Float p0 = c[4] * m[0] + c[5] * m[1] + c[6] * m[2] + c[1]; +- Float p1 = c[7] * m[0] + c[8] * m[1] + c[9] * m[2] + c[2]; +- Float p2 = c[10] * m[0] + c[11] * m[1] + c[12] * m[2] + c[3]; +- if (radial == 1) { +- Float rr = Float(1.0) + c[13] * (p0 * p0 + p1 * p1) / (p2 * p2); +- Float f_p2 = c[0] / p2; +- pj[0] = ms[0] / rr - p0 * f_p2; +- pj[1] = ms[1] / rr - p1 * f_p2; +- } else if (radial == -1) { +- Float rd = Float(1.0) + c[13] * (ms[0] * ms[0] + ms[1] * ms[1]); +- Float f_p2 = c[0] / p2 / rd; +- pj[0] = ms[0] - p0 * f_p2; +- pj[1] = ms[1] - p1 * f_p2; +- } else { +- pj[0] = ms[0] - p0 * c[0] / p2; +- pj[1] = ms[1] - p1 * c[0] / p2; +- } +- } +- } +-} +- +-template +-void ComputeProjectionQ(size_t nq, const Float* camera, const int* qmap, +- const Float* wq, Float* pj) { +- for (size_t i = 0; i < nq; ++i, qmap += 2, pj += 2, wq += 2) { +- const Float* c1 = camera + qmap[0] * 16; +- const Float* c2 = camera + qmap[1] * 16; +- pj[0] = -(c1[0] - c2[0]) * wq[0]; +- pj[1] = -(c1[13] - c2[13]) * wq[1]; +- } +-} +- +-template +-void ComputeJQX(size_t nq, const Float* x, const int* qmap, const Float* wq, +- const Float* sj, Float* jx) { +- if (sj) { +- for (size_t i = 0; i < nq; ++i, qmap += 2, jx += 2, wq += 2) { +- int idx1 = qmap[0] * 8, idx2 = qmap[1] * 8; +- const Float* x1 = x + idx1; +- const Float* x2 = x + idx2; +- const Float* sj1 = sj + idx1; +- const Float* sj2 = sj + idx2; +- jx[0] = (x1[0] * sj1[0] - x2[0] * 
sj2[0]) * wq[0]; +- jx[1] = (x1[7] * sj1[7] - x2[7] * sj2[7]) * wq[1]; +- } +- } else { +- for (size_t i = 0; i < nq; ++i, qmap += 2, jx += 2, wq += 2) { +- const Float* x1 = x + qmap[0] * 8; +- const Float* x2 = x + qmap[1] * 8; +- jx[0] = (x1[0] - x2[0]) * wq[0]; +- jx[1] = (x1[7] - x2[7]) * wq[1]; +- } +- } +-} +- +-template +-void ComputeJQtEC(size_t ncam, const Float* pe, const int* qlist, +- const Float* wq, const Float* sj, Float* v) { +- if (sj) { +- for (size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8, sj += 8) { +- int ip = qlist[0]; +- if (ip == -1) continue; +- int in = qlist[1]; +- const Float* e1 = pe + ip * 2; +- const Float* e2 = pe + in * 2; +- v[0] += wq[0] * sj[0] * (e1[0] - e2[0]); +- v[7] += wq[1] * sj[7] * (e1[1] - e2[1]); +- } +- } else { +- for (size_t i = 0; i < ncam; ++i, qlist += 2, wq += 2, v += 8) { +- int ip = qlist[0]; +- if (ip == -1) continue; +- int in = qlist[1]; +- const Float* e1 = pe + ip * 2; +- const Float* e2 = pe + in * 2; +- v[0] += wq[0] * (e1[0] - e2[0]); +- v[7] += wq[1] * (e1[1] - e2[1]); +- } +- } +-} +- +-template +-inline void JacobianOne(const Float* c, const Float* pt, const Float* ms, +- Float* jxc, Float* jyc, Float* jxp, Float* jyp, +- bool intrinsic_fixed, int radial_distortion) { +- const Float* r = c + 4; +- Float x0 = c[4] * pt[0] + c[5] * pt[1] + c[6] * pt[2]; +- Float y0 = c[7] * pt[0] + c[8] * pt[1] + c[9] * pt[2]; +- Float z0 = c[10] * pt[0] + c[11] * pt[1] + c[12] * pt[2]; +- Float p2 = (z0 + c[3]); +- Float f_p2 = c[0] / p2; +- Float p0_p2 = (x0 + c[1]) / p2; +- Float p1_p2 = (y0 + c[2]) / p2; +- +- if (radial_distortion == 1) { +- Float rr1 = c[13] * p0_p2 * p0_p2; +- Float rr2 = c[13] * p1_p2 * p1_p2; +- Float f_p2_x = Float(f_p2 * (1.0 + 3.0 * rr1 + rr2)); +- Float f_p2_y = Float(f_p2 * (1.0 + 3.0 * rr2 + rr1)); +- if (jxc) { +-#ifndef PBA_DISABLE_CONST_CAMERA +- if (c[15] != 0.0f) { +- jxc[0] = 0; +- jxc[1] = 0; +- jxc[2] = 0; +- jxc[3] = 0; +- jxc[4] = 0; +- jxc[5] = 0; +- jxc[6] = 0; +- jxc[7] = 0; +- jyc[0] = 0; +- jyc[1] = 0; +- jyc[2] = 0; +- jyc[3] = 0; +- jyc[4] = 0; +- jyc[5] = 0; +- jyc[6] = 0; +- jyc[7] = 0; +- } else +-#endif +- { +- Float jfc = intrinsic_fixed ? 0 : Float(1.0 + rr1 + rr2); +- Float ft_x_pn = +- intrinsic_fixed ? 
0 : c[0] * (p0_p2 * p0_p2 + p1_p2 * p1_p2); +- ///////////////////////////////////////////////////// +- jxc[0] = p0_p2 * jfc; +- jxc[1] = f_p2_x; +- jxc[2] = 0; +- jxc[3] = -f_p2_x * p0_p2; +- jxc[4] = -f_p2_x * p0_p2 * y0; +- jxc[5] = f_p2_x * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2_x * y0; +- jxc[7] = ft_x_pn * p0_p2; +- +- jyc[0] = p1_p2 * jfc; +- jyc[1] = 0; +- jyc[2] = f_p2_y; +- jyc[3] = -f_p2_y * p1_p2; +- jyc[4] = -f_p2_y * (z0 + y0 * p1_p2); +- jyc[5] = f_p2_y * x0 * p1_p2; +- jyc[6] = f_p2_y * x0; +- jyc[7] = ft_x_pn * p1_p2; +- } +- } +- +- /////////////////////////////////// +- if (jxp) { +- jxp[0] = f_p2_x * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2_x * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2_x * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2_y * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2_y * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2_y * (r[5] - r[8] * p1_p2); +-#ifdef POINT_DATA_ALIGN4 +- jxp[3] = jyp[3] = 0; +-#endif +- } +- } else { +- if (jxc) { +-#ifndef PBA_DISABLE_CONST_CAMERA +- if (c[15] != 0.0f) { +- jxc[0] = 0; +- jxc[1] = 0; +- jxc[2] = 0; +- jxc[3] = 0; +- jxc[4] = 0; +- jxc[5] = 0; +- jxc[6] = 0; +- jxc[7] = 0; +- jyc[0] = 0; +- jyc[1] = 0; +- jyc[2] = 0; +- jyc[3] = 0; +- jyc[4] = 0; +- jyc[5] = 0; +- jyc[6] = 0; +- jyc[7] = 0; +- } else +-#endif +- { +- jxc[0] = intrinsic_fixed ? 0 : p0_p2; +- jxc[1] = f_p2; +- jxc[2] = 0; +- jxc[3] = -f_p2 * p0_p2; +- jxc[4] = -f_p2 * p0_p2 * y0; +- jxc[5] = f_p2 * (z0 + x0 * p0_p2); +- jxc[6] = -f_p2 * y0; +- +- jyc[0] = intrinsic_fixed ? 0 : p1_p2; +- jyc[1] = 0; +- jyc[2] = f_p2; +- jyc[3] = -f_p2 * p1_p2; +- jyc[4] = -f_p2 * (z0 + y0 * p1_p2); +- jyc[5] = f_p2 * x0 * p1_p2; +- jyc[6] = f_p2 * x0; +- +- if (radial_distortion == -1 && !intrinsic_fixed) { +- Float msn = ms[0] * ms[0] + ms[1] * ms[1]; +- jxc[7] = -ms[0] * msn; +- jyc[7] = -ms[1] * msn; +- } else { +- jxc[7] = 0; +- jyc[7] = 0; +- } +- } +- } +- /////////////////////////////////// +- if (jxp) { +- jxp[0] = f_p2 * (r[0] - r[6] * p0_p2); +- jxp[1] = f_p2 * (r[1] - r[7] * p0_p2); +- jxp[2] = f_p2 * (r[2] - r[8] * p0_p2); +- jyp[0] = f_p2 * (r[3] - r[6] * p1_p2); +- jyp[1] = f_p2 * (r[4] - r[7] * p1_p2); +- jyp[2] = f_p2 * (r[5] - r[8] * p1_p2); +-#ifdef POINT_DATA_ALIGN4 +- jxp[3] = jyp[3] = 0; +-#endif +- } +- } +-} +- +-template +-void ComputeJacobian(size_t nproj, size_t ncam, const Float* camera, +- const Float* point, Float* jc, Float* jp, const int* jmap, +- const Float* sj, const Float* ms, const int* cmlist, +- bool intrinsic_fixed, int radial_distortion, bool shuffle, +- Float* jct, int mt = 2, int i0 = 0); +- +-DEFINE_THREAD_DATA(ComputeJacobian) +-size_t nproj, ncam; +-const Float *camera, *point; +-Float *jc, *jp; +-const int* jmap; +-const Float *sj, *ms; +-const int* cmlist; +-bool intrinsic_fixed; +-int radial_distortion; +-bool shuffle; +-Float* jct; +-int i0; +-BEGIN_THREAD_PROC(ComputeJacobian) +-ComputeJacobian(q->nproj, q->ncam, q->camera, q->point, q->jc, q->jp, q->jmap, +- q->sj, q->ms, q->cmlist, q->intrinsic_fixed, +- q->radial_distortion, q->shuffle, q->jct, 0, q->i0); +-END_THREAD_RPOC(ComputeJacobian) +- +-template +-void ComputeJacobian(size_t nproj, size_t ncam, const Float* camera, +- const Float* point, Float* jc, Float* jp, const int* jmap, +- const Float* sj, const Float* ms, const int* cmlist, +- bool intrinsic_fixed, int radial_distortion, bool shuffle, +- Float* jct, int mt, int i0) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < 
thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeJacobian, threads[i], last, ncam, camera, point, jc, jp, +- jmap + 2 * first, sj, ms + 2 * first, cmlist + first, +- intrinsic_fixed, radial_distortion, shuffle, jct, first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- const Float* sjc0 = sj; +- const Float* sjp0 = sj ? sj + ncam * 8 : NULL; +- +- for (size_t i = i0; i < nproj; ++i, jmap += 2, ms += 2, ++cmlist) { +- int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- Float* jci = jc ? (jc + (shuffle ? cmlist[0] : i) * 16) : NULL; +- Float* jpi = jp ? (jp + i * POINT_ALIGN2) : NULL; +- +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jci, jci + 8, jpi, jpi + POINT_ALIGN, +- intrinsic_fixed, radial_distortion); +- +- /////////////////////////////////////////////////// +- if (sjc0) { +- // jacobian scaling +- if (jci) { +- ScaleJ8(jci, jci + 8, sjc0 + cidx * 8); +- } +- if (jpi) { +- const Float* sjp = sjp0 + pidx * POINT_ALIGN; +- for (int j = 0; j < 3; ++j) { +- jpi[j] *= sjp[j]; +- jpi[POINT_ALIGN + j] *= sjp[j]; +- } +- } +- } +- +- if (jct && jc) MemoryCopyB(jci, jci + 16, jct + cmlist[0] * 16); +- } +- } +-} +- +-template +-void ComputeDiagonalAddQ(size_t ncam, const Float* qw, Float* d, +- const Float* sj = NULL) { +- if (sj) { +- for (size_t i = 0; i < ncam; ++i, qw += 2, d += 8, sj += 8) { +- if (qw[0] == 0) continue; +- Float j1 = qw[0] * sj[0]; +- Float j2 = qw[1] * sj[7]; +- d[0] += (j1 * j1 * 2.0f); +- d[7] += (j2 * j2 * 2.0f); +- } +- } else { +- for (size_t i = 0; i < ncam; ++i, qw += 2, d += 8) { +- if (qw[0] == 0) continue; +- d[0] += (qw[0] * qw[0] * 2.0f); +- d[7] += (qw[1] * qw[1] * 2.0f); +- } +- } +-} +- +-/////////////////////////////////////// +-template +-void ComputeDiagonal(const avec& jcv, const vector& cmapv, +- const avec& jpv, const vector& pmapv, +- const vector& cmlistv, const Float* qw0, +- avec& jtjdi, bool jc_transpose, int radial) { +- // first camera part +- if (jcv.size() == 0 || jpv.size() == 0) return; // not gonna happen +- +- size_t ncam = cmapv.size() - 1, npts = pmapv.size() - 1; +- const int vn = radial ? 8 : 7; +- SetVectorZero(jtjdi); +- +- const int* cmap = &cmapv[0]; +- const int* pmap = &pmapv[0]; +- const int* cmlist = &cmlistv[0]; +- const Float* jc = &jcv[0]; +- const Float* jp = &jpv[0]; +- const Float* qw = qw0; +- Float* jji = &jtjdi[0]; +- +- ///////compute jc part +- for (size_t i = 0; i < ncam; ++i, jji += 8, ++cmap, qw += 2) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- ////////////////////////////////////// +- for (int j = idx1; j < idx2; ++j) { +- int idx = jc_transpose ? j : cmlist[j]; +- const Float* pj = jc + idx * 16; +- /////////////////////////////////////////// +- for (int k = 0; k < vn; ++k) +- jji[k] += (pj[k] * pj[k] + pj[k + 8] * pj[k + 8]); +- } +- if (qw0 && qw[0] > 0) { +- jji[0] += (qw[0] * qw[0] * 2.0f); +- jji[7] += (qw[1] * qw[1] * 2.0f); +- } +- } +- +- for (size_t i = 0; i < npts; ++i, jji += POINT_ALIGN, ++pmap) { +- int idx1 = pmap[0], idx2 = pmap[1]; +- const Float* pj = jp + idx1 * POINT_ALIGN2; +- for (int j = idx1; j < idx2; ++j, pj += POINT_ALIGN2) { +- for (int k = 0; k < 3; ++k) +- jji[k] += (pj[k] * pj[k] + pj[k + POINT_ALIGN] * pj[k + POINT_ALIGN]); +- } +- } +- Float* it = jtjdi.begin(); +- for (; it < jtjdi.end(); ++it) { +- *it = (*it == 0) ? 
0 : Float(1.0 / (*it)); +- } +-} +- +-template +-void InvertSymmetricMatrix(T a[n][m], T ai[n][m]) { +- for (int i = 0; i < n; ++i) { +- if (a[i][i] > 0) { +- a[i][i] = sqrt(a[i][i]); +- for (int j = i + 1; j < n; ++j) a[j][i] = a[j][i] / a[i][i]; +- for (int j = i + 1; j < n; ++j) +- for (int k = j; k < n; ++k) a[k][j] -= a[k][i] * a[j][i]; +- } +- } +- ///////////////////////////// +- // inv(L) +- for (int i = 0; i < n; ++i) { +- if (a[i][i] == 0) continue; +- a[i][i] = 1.0f / a[i][i]; +- } +- for (int i = 1; i < n; ++i) { +- if (a[i][i] == 0) continue; +- for (int j = 0; j < i; ++j) { +- T sum = 0; +- for (int k = j; k < i; ++k) sum += (a[i][k] * a[k][j]); +- a[i][j] = -sum * a[i][i]; +- } +- } +- ///////////////////////////// +- // inv(L)' * inv(L) +- for (int i = 0; i < n; ++i) { +- for (int j = i; j < n; ++j) { +- ai[i][j] = 0; +- for (int k = j; k < n; ++k) ai[i][j] += a[k][i] * a[k][j]; +- ai[j][i] = ai[i][j]; +- } +- } +-} +-template +-void InvertSymmetricMatrix(T* a, T* ai) { +- InvertSymmetricMatrix((T(*)[m])a, (T(*)[m])ai); +-} +- +-template +-void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2, +- const Float* jc, const int* cmap, const int* cmlist, +- Float* di, Float* bi, int vn, bool jc_transpose, +- bool use_jq, int mt); +- +-DEFINE_THREAD_DATA(ComputeDiagonalBlockC) +-size_t ncam; +-float lambda1, lambda2; +-const Float* jc; +-const int *cmap, *cmlist; +-Float *di, *bi; +-int vn; +-bool jc_transpose, use_jq; +-BEGIN_THREAD_PROC(ComputeDiagonalBlockC) +-ComputeDiagonalBlockC(q->ncam, q->lambda1, q->lambda2, q->jc, q->cmap, +- q->cmlist, q->di, q->bi, q->vn, q->jc_transpose, +- q->use_jq, 0); +-END_THREAD_RPOC(ComputeDiagonalBlockC) +- +-template +-void ComputeDiagonalBlockC(size_t ncam, float lambda1, float lambda2, +- const Float* jc, const int* cmap, const int* cmlist, +- Float* di, Float* bi, int vn, bool jc_transpose, +- bool use_jq, int mt) { +- const size_t bc = vn * 8; +- +- if (mt > 1 && ncam >= (size_t)mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = ncam * i / thread_num; +- size_t last_ = ncam * (i + 1) / thread_num; +- size_t last = std::min(last_, ncam); +- RUN_THREAD(ComputeDiagonalBlockC, threads[i], (last - first), lambda1, +- lambda2, jc, cmap + first, cmlist, di + 8 * first, +- bi + bc * first, vn, jc_transpose, use_jq); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- Float bufv[64 + 8]; // size_t offset = ((size_t)bufv) & 0xf; +- // Float* pbuf = bufv + ((16 - offset) / sizeof(Float)); +- Float* pbuf = (Float*)ALIGN_PTR(bufv); +- +- ///////compute jc part +- for (size_t i = 0; i < ncam; ++i, ++cmap, bi += bc) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- ////////////////////////////////////// +- if (idx1 == idx2) { +- SetVectorZero(bi, bi + bc); +- } else { +- SetVectorZero(pbuf, pbuf + 64); +- +- for (int j = idx1; j < idx2; ++j) { +- int idx = jc_transpose ? j : cmlist[j]; +- const Float* pj = jc + idx * 16; +- ///////////////////////////////// +- AddBlockJtJ(pj, pbuf, vn); +- AddBlockJtJ(pj + 8, pbuf, vn); +- } +- +- // change and copy the diagonal +- +- if (use_jq) { +- Float* pb = pbuf; +- for (int j = 0; j < 8; ++j, ++di, pb += 9) { +- Float temp; +- di[0] = temp = (di[0] + pb[0]); +- pb[0] = lambda2 * temp + lambda1; +- } +- } else { +- Float* pb = pbuf; +- for (int j = 0; j < 8; ++j, ++di, pb += 9) { +- *pb = lambda2 * ((*di) = (*pb)) + lambda1; +- } +- } +- +- // invert the matrix? 
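// The per-camera J^T J block accumulated in pbuf (8x8, or 7x7 when radial
// distortion is not solved for) is inverted with the Cholesky-based
// InvertSymmetricMatrix above: factor A = L * L^T in place, invert the
// triangular factor, then form A^{-1} = L^{-T} * L^{-1}.  The damped diagonal
// (lambda2 * d + lambda1) is written into pbuf just before the inversion,
// while the undamped diagonal is kept in di.  (The two call branches below
// differ only in their template arguments, presumably 8x8 vs 7x8 in the
// original source; the angle-bracket arguments do not survive in this copy.)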
+- if (vn == 8) +- InvertSymmetricMatrix(pbuf, bi); +- else +- InvertSymmetricMatrix(pbuf, bi); +- } +- } +- } +-} +- +-template +-void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2, +- const Float* jp, const int* pmap, Float* di, +- Float* bi, int mt); +- +-DEFINE_THREAD_DATA(ComputeDiagonalBlockP) +-size_t npt; +-float lambda1, lambda2; +-const Float* jp; +-const int* pmap; +-Float *di, *bi; +-BEGIN_THREAD_PROC(ComputeDiagonalBlockP) +-ComputeDiagonalBlockP(q->npt, q->lambda1, q->lambda2, q->jp, q->pmap, q->di, +- q->bi, 0); +-END_THREAD_RPOC(ComputeDiagonalBlockP) +- +-template +-void ComputeDiagonalBlockP(size_t npt, float lambda1, float lambda2, +- const Float* jp, const int* pmap, Float* di, +- Float* bi, int mt) { +- if (mt > 1) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = npt * i / thread_num; +- size_t last_ = npt * (i + 1) / thread_num; +- size_t last = std::min(last_, npt); +- RUN_THREAD(ComputeDiagonalBlockP, threads[i], (last - first), lambda1, +- lambda2, jp, pmap + first, di + POINT_ALIGN * first, +- bi + 6 * first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (size_t i = 0; i < npt; ++i, ++pmap, di += POINT_ALIGN, bi += 6) { +- int idx1 = pmap[0], idx2 = pmap[1]; +- +- Float M00 = 0, M01 = 0, M02 = 0, M11 = 0, M12 = 0, M22 = 0; +- const Float *jxp = jp + idx1 * (POINT_ALIGN2), *jyp = jxp + POINT_ALIGN; +- for (int j = idx1; j < idx2; +- ++j, jxp += POINT_ALIGN2, jyp += POINT_ALIGN2) { +- M00 += (jxp[0] * jxp[0] + jyp[0] * jyp[0]); +- M01 += (jxp[0] * jxp[1] + jyp[0] * jyp[1]); +- M02 += (jxp[0] * jxp[2] + jyp[0] * jyp[2]); +- M11 += (jxp[1] * jxp[1] + jyp[1] * jyp[1]); +- M12 += (jxp[1] * jxp[2] + jyp[1] * jyp[2]); +- M22 += (jxp[2] * jxp[2] + jyp[2] * jyp[2]); +- } +- +- ///////////////////////////////// +- di[0] = M00; +- di[1] = M11; +- di[2] = M22; +- +- ///////////////////////////// +- M00 = M00 * lambda2 + lambda1; +- M11 = M11 * lambda2 + lambda1; +- M22 = M22 * lambda2 + lambda1; +- +- /////////////////////////////// +- Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- // SetVectorZero(bi, bi + 6); +- for (int j = 0; j < 6; ++j) bi[j] = 0; +- } else { +- bi[0] = (M11 * M22 - M12 * M12) / det; +- bi[1] = -(M01 * M22 - M12 * M02) / det; +- bi[2] = (M01 * M12 - M02 * M11) / det; +- bi[3] = (M00 * M22 - M02 * M02) / det; +- bi[4] = -(M00 * M12 - M01 * M02) / det; +- bi[5] = (M00 * M11 - M01 * M01) / det; +- } +- } +- } +-} +- +-template +-void ComputeDiagonalBlock(size_t ncam, size_t npts, float lambda, bool dampd, +- const Float* jc, const int* cmap, const Float* jp, +- const int* pmap, const int* cmlist, const Float* sj, +- const Float* wq, Float* diag, Float* blocks, +- int radial_distortion, bool jc_transpose, int mt1 = 2, +- int mt2 = 2, int mode = 0) { +- const int vn = radial_distortion ? 8 : 7; +- const size_t bc = vn * 8; +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? 
(1.0f + lambda) : 1.0f; +- +- if (mode == 0) { +- const size_t bsz = bc * ncam + npts * 6; +- const size_t dsz = 8 * ncam + npts * POINT_ALIGN; +- bool use_jq = wq != NULL; +- /////////////////////////////////////////// +- SetVectorZero(blocks, blocks + bsz); +- SetVectorZero(diag, diag + dsz); +- +- //////////////////////////////// +- if (use_jq) ComputeDiagonalAddQ(ncam, wq, diag, sj); +- ComputeDiagonalBlockC(ncam, lambda1, lambda2, jc, cmap, cmlist, diag, +- blocks, vn, jc_transpose, use_jq, mt1); +- ComputeDiagonalBlockP(npts, lambda1, lambda2, jp, pmap, diag + 8 * ncam, +- blocks + bc * ncam, mt2); +- } else if (mode == 1) { +- const size_t bsz = bc * ncam; +- const size_t dsz = 8 * ncam; +- bool use_jq = wq != NULL; +- /////////////////////////////////////////// +- SetVectorZero(blocks, blocks + bsz); +- SetVectorZero(diag, diag + dsz); +- +- //////////////////////////////// +- if (use_jq) ComputeDiagonalAddQ(ncam, wq, diag, sj); +- ComputeDiagonalBlockC(ncam, lambda1, lambda2, jc, cmap, cmlist, diag, +- blocks, vn, jc_transpose, use_jq, mt1); +- } else { +- blocks += bc * ncam; +- diag += 8 * ncam; +- const size_t bsz = npts * 6; +- const size_t dsz = npts * POINT_ALIGN; +- /////////////////////////////////////////// +- SetVectorZero(blocks, blocks + bsz); +- SetVectorZero(diag, diag + dsz); +- +- //////////////////////////////// +- ComputeDiagonalBlockP(npts, lambda1, lambda2, jp, pmap, diag, blocks, mt2); +- } +-} +- +-template +-void ComputeDiagonalBlock_(float lambda, bool dampd, const avec& camerav, +- const avec& pointv, const avec& meas, +- const vector& jmapv, const avec& sjv, +- avec& qwv, avec& diag, +- avec& blocks, bool intrinsic_fixed, +- int radial_distortion, int mode = 0) { +- const int vn = radial_distortion ? 8 : 7; +- const size_t szbc = vn * 8; +- size_t ncam = camerav.size() / 16; +- size_t npts = pointv.size() / POINT_ALIGN; +- size_t sz_jcd = ncam * 8; +- size_t sz_jcb = ncam * szbc; +- avec blockpv(blocks.size()); +- SetVectorZero(blockpv); +- SetVectorZero(diag); +- ////////////////////////////////////////////////////// +- float lambda1 = dampd ? 0.0f : lambda; +- float lambda2 = dampd ? (1.0f + lambda) : 1.0f; +- +- Float jbufv[24 + 8]; // size_t offset = ((size_t) jbufv) & 0xf; +- // Float* jxc = jbufv + ((16 - offset) / sizeof(Float)); +- Float* jxc = (Float*)ALIGN_PTR(jbufv); +- Float *jyc = jxc + 8, *jxp = jxc + 16, *jyp = jxc + 20; +- +- ////////////////////////////// +- const int* jmap = &jmapv[0]; +- const Float* camera = &camerav[0]; +- const Float* point = &pointv[0]; +- const Float* ms = &meas[0]; +- const Float* sjc0 = sjv.size() ? &sjv[0] : NULL; +- const Float* sjp0 = sjv.size() ? 
&sjv[sz_jcd] : NULL; +- ////////////////////////////////////////////// +- Float *blockpc = &blockpv[0], *blockpp = &blockpv[sz_jcb]; +- Float *bo = blockpc, *bi = &blocks[0], *di = &diag[0]; +- +- ///////////////////////////////////////////////////////// +- // diagonal blocks +- for (size_t i = 0; i < jmapv.size(); i += 2, jmap += 2, ms += 2) { +- int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jxc, jyc, jxp, jyp, intrinsic_fixed, +- radial_distortion); +- +- /////////////////////////////////////////////////////////// +- if (mode != 2) { +- if (sjc0) { +- const Float* sjc = sjc0 + cidx * 8; +- ScaleJ8(jxc, jyc, sjc); +- } +- ///////////////////////////////////////// +- Float* bc = blockpc + cidx * szbc; +- AddBlockJtJ(jxc, bc, vn); +- AddBlockJtJ(jyc, bc, vn); +- } +- +- if (mode != 1) { +- if (sjp0) { +- const Float* sjp = sjp0 + pidx * POINT_ALIGN; +- jxp[0] *= sjp[0]; +- jxp[1] *= sjp[1]; +- jxp[2] *= sjp[2]; +- jyp[0] *= sjp[0]; +- jyp[1] *= sjp[1]; +- jyp[2] *= sjp[2]; +- } +- +- /////////////////////////////////////////// +- Float* bp = blockpp + pidx * 6; +- bp[0] += (jxp[0] * jxp[0] + jyp[0] * jyp[0]); +- bp[1] += (jxp[0] * jxp[1] + jyp[0] * jyp[1]); +- bp[2] += (jxp[0] * jxp[2] + jyp[0] * jyp[2]); +- bp[3] += (jxp[1] * jxp[1] + jyp[1] * jyp[1]); +- bp[4] += (jxp[1] * jxp[2] + jyp[1] * jyp[2]); +- bp[5] += (jxp[2] * jxp[2] + jyp[2] * jyp[2]); +- } +- } +- +- /// invert the camera part +- if (mode != 2) { +- ///////////////////////////////////////// +- const Float* qw = qwv.begin(); +- if (qw) { +- for (size_t i = 0; i < ncam; ++i, qw += 2) { +- if (qw[0] == 0) continue; +- Float* bc = blockpc + i * szbc; +- if (sjc0) { +- const Float* sjc = sjc0 + i * 8; +- Float j1 = sjc[0] * qw[0]; +- Float j2 = sjc[7] * qw[1]; +- bc[0] += (j1 * j1 * 2.0f); +- if (radial_distortion) bc[63] += (j2 * j2 * 2.0f); +- } else { +- const Float* sjc = sjc0 + i * 8; +- bc[0] += (qw[0] * qw[0] * 2.0f); +- if (radial_distortion) bc[63] += (qw[1] * qw[1] * 2.0f); +- } +- } +- } +- +- for (size_t i = 0; i < ncam; ++i, bo += szbc, bi += szbc, di += 8) { +- Float *bp = bo, *dip = di; +- for (int j = 0; j < vn; ++j, ++dip, bp += 9) { +- dip[0] = bp[0]; +- bp[0] = lambda2 * bp[0] + lambda1; +- } +- +- // invert the matrix? 
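// Same pattern as ComputeDiagonalBlockC: each accumulated camera block bo is
// damped on its diagonal and inverted into the preconditioner block bi with
// the Cholesky-based InvertSymmetricMatrix.  The 3x3 point blocks handled
// afterwards are inverted in closed form from their determinant and
// cofactors, with a zero fallback when the determinant overflows or is
// (nearly) zero.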
+- if (radial_distortion) +- InvertSymmetricMatrix(bo, bi); +- else +- InvertSymmetricMatrix(bo, bi); +- } +- } else { +- bo += szbc * ncam; +- bi += szbc * ncam; +- di += 8 * ncam; +- } +- +- /////////////////////////////////////////// +- // inverting the point part +- if (mode != 1) { +- for (size_t i = 0; i < npts; ++i, bo += 6, bi += 6, di += POINT_ALIGN) { +- Float &M00 = bo[0], &M01 = bo[1], &M02 = bo[2]; +- Float &M11 = bo[3], &M12 = bo[4], &M22 = bo[5]; +- di[0] = M00; +- di[1] = M11; +- di[2] = M22; +- +- ///////////////////////////// +- M00 = M00 * lambda2 + lambda1; +- M11 = M11 * lambda2 + lambda1; +- M22 = M22 * lambda2 + lambda1; +- +- /////////////////////////////// +- Float det = (M00 * M11 - M01 * M01) * M22 + Float(2.0) * M01 * M12 * M02 - +- M02 * M02 * M11 - M12 * M12 * M00; +- if (det >= FLT_MAX || det <= FLT_MIN * 2.0f) { +- for (int j = 0; j < 6; ++j) bi[j] = 0; +- } else { +- bi[0] = (M11 * M22 - M12 * M12) / det; +- bi[1] = -(M01 * M22 - M12 * M02) / det; +- bi[2] = (M01 * M12 - M02 * M11) / det; +- bi[3] = (M00 * M22 - M02 * M02) / det; +- bi[4] = -(M00 * M12 - M01 * M02) / det; +- bi[5] = (M00 * M11 - M01 * M01) / det; +- } +- } +- } +-} +- +-template +-void MultiplyBlockConditionerC(int ncam, const Float* bi, const Float* x, +- Float* vx, int vn, int mt = 0); +- +-DEFINE_THREAD_DATA(MultiplyBlockConditionerC) +-int ncam; +-const Float *bi, *x; +-Float* vx; +-int vn; +-BEGIN_THREAD_PROC(MultiplyBlockConditionerC) +-MultiplyBlockConditionerC(q->ncam, q->bi, q->x, q->vx, q->vn, 0); +-END_THREAD_RPOC(MultiplyBlockConditionerC) +- +-template +-void MultiplyBlockConditionerC(int ncam, const Float* bi, const Float* x, +- Float* vx, int vn, int mt) { +- if (mt > 1 && ncam >= mt) { +- const size_t bc = vn * 8; +- MYTHREAD threads[THREAD_NUM_MAX]; +- const int thread_num = std::min(mt, THREAD_NUM_MAX); +- for (int i = 0; i < thread_num; ++i) { +- int first = ncam * i / thread_num; +- int last_ = ncam * (i + 1) / thread_num; +- int last = std::min(last_, ncam); +- RUN_THREAD(MultiplyBlockConditionerC, threads[i], (last - first), +- bi + first * bc, x + 8 * first, vx + 8 * first, vn); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (int i = 0; i < ncam; ++i, x += 8, vx += 8) { +- Float* vxc = vx; +- for (int j = 0; j < vn; ++j, bi += 8, ++vxc) *vxc = DotProduct8(bi, x); +- } +- } +-} +- +-template +-void MultiplyBlockConditionerP(int npoint, const Float* bi, const Float* x, +- Float* vx, int mt = 0); +- +-DEFINE_THREAD_DATA(MultiplyBlockConditionerP) +-int npoint; +-const Float *bi, *x; +-Float* vx; +-BEGIN_THREAD_PROC(MultiplyBlockConditionerP) +-MultiplyBlockConditionerP(q->npoint, q->bi, q->x, q->vx, 0); +-END_THREAD_RPOC(MultiplyBlockConditionerP) +- +-template +-void MultiplyBlockConditionerP(int npoint, const Float* bi, const Float* x, +- Float* vx, int mt) { +- if (mt > 1 && npoint >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const int thread_num = std::min(mt, THREAD_NUM_MAX); +- for (int i = 0; i < thread_num; ++i) { +- int first = npoint * i / thread_num; +- int last_ = npoint * (i + 1) / thread_num; +- int last = std::min(last_, npoint); +- RUN_THREAD(MultiplyBlockConditionerP, threads[i], (last - first), +- bi + first * 6, x + POINT_ALIGN * first, +- vx + POINT_ALIGN * first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (int i = 0; i < npoint; +- ++i, bi += 6, x += POINT_ALIGN, vx += POINT_ALIGN) { +- vx[0] = (bi[0] * x[0] + bi[1] * x[1] + bi[2] * x[2]); +- vx[1] = (bi[1] * x[0] + bi[3] * x[1] + bi[4] * x[2]); +- vx[2] = (bi[2] 
* x[0] + bi[4] * x[1] + bi[5] * x[2]); +- } +- } +-} +- +-template +-void MultiplyBlockConditioner(int ncam, int npoint, const Float* blocksv, +- const Float* vec, Float* resultv, int radial, +- int mode, int mt1, int mt2) { +- const int vn = radial ? 8 : 7; +- if (mode != 2) +- MultiplyBlockConditionerC(ncam, blocksv, vec, resultv, vn, mt1); +- if (mt2 == 0) mt2 = AUTO_MT_NUM(npoint * 24); +- if (mode != 1) +- MultiplyBlockConditionerP(npoint, blocksv + (vn * 8 * ncam), vec + ncam * 8, +- resultv + 8 * ncam, mt2); +-} +- +-template +-void ComputeJX(size_t nproj, size_t ncam, const Float* x, const Float* jc, +- const Float* jp, const int* jmap, Float* jx, int mode, +- int mt = 2); +- +-DEFINE_THREAD_DATA(ComputeJX) +-size_t nproj, ncam; +-const Float *xc, *jc, *jp; +-const int* jmap; +-Float* jx; +-int mode; +-BEGIN_THREAD_PROC(ComputeJX) +-ComputeJX(q->nproj, q->ncam, q->xc, q->jc, q->jp, q->jmap, q->jx, q->mode, 0); +-END_THREAD_RPOC(ComputeJX) +- +-template +-void ComputeJX(size_t nproj, size_t ncam, const Float* x, const Float* jc, +- const Float* jp, const int* jmap, Float* jx, int mode, int mt) { +- if (mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeJX, threads[i], (last - first), ncam, x, +- jc + 16 * first, jp + POINT_ALIGN2 * first, jmap + first * 2, +- jx + first * 2, mode); +- } +- WAIT_THREAD(threads, thread_num); +- } else if (mode == 0) { +- const Float *pxc = x, *pxp = pxc + ncam * 8; +- // clock_t tp = clock(); double s1 = 0, s2 = 0; +- for (size_t i = 0; i < nproj; +- ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) { +- ComputeTwoJX(jc, jp, pxc + jmap[0] * 8, pxp + jmap[1] * POINT_ALIGN, jx); +- } +- } else if (mode == 1) { +- const Float* pxc = x; +- // clock_t tp = clock(); double s1 = 0, s2 = 0; +- for (size_t i = 0; i < nproj; +- ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) { +- const Float* xc = pxc + jmap[0] * 8; +- jx[0] = DotProduct8(jc, xc); +- jx[1] = DotProduct8(jc + 8, xc); +- } +- } else if (mode == 2) { +- const Float* pxp = x + ncam * 8; +- // clock_t tp = clock(); double s1 = 0, s2 = 0; +- for (size_t i = 0; i < nproj; +- ++i, jmap += 2, jc += 16, jp += POINT_ALIGN2, jx += 2) { +- const Float* xp = pxp + jmap[1] * POINT_ALIGN; +- jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]); +- } +- } +-} +- +-template +-void ComputeJX_(size_t nproj, size_t ncam, const Float* x, Float* jx, +- const Float* camera, const Float* point, const Float* ms, +- const Float* sj, const int* jmap, bool intrinsic_fixed, +- int radial_distortion, int mode, int mt = 16); +- +-DEFINE_THREAD_DATA(ComputeJX_) +-size_t nproj, ncam; +-const Float* x; +-Float* jx; +-const Float *camera, *point, *ms, *sj; +-const int* jmap; +-bool intrinsic_fixed; +-int radial_distortion; +-int mode; +-BEGIN_THREAD_PROC(ComputeJX_) +-ComputeJX_(q->nproj, q->ncam, q->x, q->jx, q->camera, q->point, q->ms, q->sj, +- q->jmap, q->intrinsic_fixed, q->radial_distortion, q->mode, 0); +-END_THREAD_RPOC(ComputeJX_) +- +-template +-void ComputeJX_(size_t nproj, size_t ncam, const Float* x, Float* jx, +- const Float* camera, const Float* point, const Float* ms, +- const Float* sj, const int* jmap, bool intrinsic_fixed, +- int radial_distortion, int mode, int mt) { +- if 
(mt > 1 && nproj >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = nproj * i / thread_num; +- size_t last_ = nproj * (i + 1) / thread_num; +- size_t last = std::min(last_, nproj); +- RUN_THREAD(ComputeJX_, threads[i], (last - first), ncam, x, +- jx + first * 2, camera, point, ms + 2 * first, sj, +- jmap + first * 2, intrinsic_fixed, radial_distortion, mode); +- } +- WAIT_THREAD(threads, thread_num); +- } else if (mode == 0) { +- Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jc = jcv + (16 - offset) / sizeof(Float), *jp = jc + 16; +- Float *jc = (Float *)ALIGN_PTR(jcv), *jp = jc + 16; +- //////////////////////////////////////// +- const Float* sjc = sj; +- const Float* sjp = sjc ? (sjc + ncam * 8) : NULL; +- const Float *xc0 = x, *xp0 = x + ncam * 8; +- +- ///////////////////////////////// +- for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) { +- const int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, jp, jp + POINT_ALIGN, intrinsic_fixed, +- radial_distortion); +- if (sjc) { +- // jacobian scaling +- ScaleJ8(jc, jc + 8, sjc + cidx * 8); +- const Float* sjpi = sjp + pidx * POINT_ALIGN; +- for (int j = 0; j < 3; ++j) { +- jp[j] *= sjpi[j]; +- jp[POINT_ALIGN + j] *= sjpi[j]; +- } +- } +- //////////////////////////////////// +- ComputeTwoJX(jc, jp, xc0 + cidx * 8, xp0 + pidx * POINT_ALIGN, jx); +- } +- } else if (mode == 1) { +- Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jc = jcv + (16 - offset) / sizeof(Float); +- Float* jc = (Float*)ALIGN_PTR(jcv); +- +- //////////////////////////////////////// +- const Float *sjc = sj, *xc0 = x; +- +- ///////////////////////////////// +- for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) { +- const int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, (Float*)NULL, (Float*)NULL, +- intrinsic_fixed, radial_distortion); +- if (sjc) ScaleJ8(jc, jc + 8, sjc + cidx * 8); +- const Float* xc = xc0 + cidx * 8; +- jx[0] = DotProduct8(jc, xc); +- jx[1] = DotProduct8(jc + 8, xc); +- } +- } else if (mode == 2) { +- Float jp[8]; +- +- //////////////////////////////////////// +- const Float* sjp = sj ? 
(sj + ncam * 8) : NULL; +- const Float* xp0 = x + ncam * 8; +- +- ///////////////////////////////// +- for (size_t i = 0; i < nproj; ++i, ms += 2, jmap += 2, jx += 2) { +- const int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, (Float*)NULL, (Float*)NULL, jp, jp + POINT_ALIGN, +- intrinsic_fixed, radial_distortion); +- +- const Float* xp = xp0 + pidx * POINT_ALIGN; +- if (sjp) { +- const Float* s = sjp + pidx * POINT_ALIGN; +- jx[0] = (jp[0] * xp[0] * s[0] + jp[1] * xp[1] * s[1] + +- jp[2] * xp[2] * s[2]); +- jx[1] = (jp[3] * xp[0] * s[0] + jp[4] * xp[1] * s[1] + +- jp[5] * xp[2] * s[2]); +- } else { +- jx[0] = (jp[0] * xp[0] + jp[1] * xp[1] + jp[2] * xp[2]); +- jx[1] = (jp[3] * xp[0] + jp[4] * xp[1] + jp[5] * xp[2]); +- } +- } +- } +-} +- +-template +-void ComputeJtEC(size_t ncam, const Float* pe, const Float* jc, const int* cmap, +- const int* cmlist, Float* v, bool jc_transpose, int mt); +- +-DEFINE_THREAD_DATA(ComputeJtEC) +-size_t ncam; +-const Float *pe, *jc; +-const int *cmap, *cmlist; +-Float* v; +-bool jc_transpose; +-BEGIN_THREAD_PROC(ComputeJtEC) +-ComputeJtEC(q->ncam, q->pe, q->jc, q->cmap, q->cmlist, q->v, q->jc_transpose, +- 0); +-END_THREAD_RPOC(ComputeJtEC) +- +-template +-void ComputeJtEC(size_t ncam, const Float* pe, const Float* jc, const int* cmap, +- const int* cmlist, Float* v, bool jc_transpose, int mt) { +- if (mt > 1 && ncam >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; // if(ncam < mt) mt = ncam; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = ncam * i / thread_num; +- size_t last_ = ncam * (i + 1) / thread_num; +- size_t last = std::min(last_, ncam); +- RUN_THREAD(ComputeJtEC, threads[i], (last - first), pe, jc, cmap + first, +- cmlist, v + 8 * first, jc_transpose); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- ///////////////////////////////// +- for (size_t i = 0; i < ncam; ++i, ++cmap, v += 8) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- for (int j = idx1; j < idx2; ++j) { +- int edx = cmlist[j]; +- const Float* pj = jc + ((jc_transpose ? 
j : edx) * 16); +- const Float* e = pe + edx * 2; +- ////////////////////////////// +- AddScaledVec8(e[0], pj, v); +- AddScaledVec8(e[1], pj + 8, v); +- } +- } +- } +-} +- +-template +-void ComputeJtEP(size_t npt, const Float* pe, const Float* jp, const int* pmap, +- Float* v, int mt); +- +-DEFINE_THREAD_DATA(ComputeJtEP) +-size_t npt; +-const Float *pe, *jp; +-const int* pmap; +-Float* v; +-BEGIN_THREAD_PROC(ComputeJtEP) +-ComputeJtEP(q->npt, q->pe, q->jp, q->pmap, q->v, 0); +-END_THREAD_RPOC(ComputeJtEP) +- +-template +-void ComputeJtEP(size_t npt, const Float* pe, const Float* jp, const int* pmap, +- Float* v, int mt) { +- if (mt > 1 && npt >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = npt * i / thread_num; +- size_t last_ = npt * (i + 1) / thread_num; +- size_t last = std::min(last_, npt); +- RUN_THREAD(ComputeJtEP, threads[i], (last - first), pe, jp, pmap + first, +- v + POINT_ALIGN * first); +- } +- WAIT_THREAD(threads, thread_num); +- } else { +- for (size_t i = 0; i < npt; ++i, ++pmap, v += POINT_ALIGN) { +- int idx1 = pmap[0], idx2 = pmap[1]; +- const Float* pj = jp + idx1 * POINT_ALIGN2; +- const Float* e = pe + idx1 * 2; +- Float temp[3] = {0, 0, 0}; +- for (int j = idx1; j < idx2; ++j, pj += POINT_ALIGN2, e += 2) { +- temp[0] += (e[0] * pj[0] + e[1] * pj[POINT_ALIGN]); +- temp[1] += (e[0] * pj[1] + e[1] * pj[POINT_ALIGN + 1]); +- temp[2] += (e[0] * pj[2] + e[1] * pj[POINT_ALIGN + 2]); +- } +- v[0] = temp[0]; +- v[1] = temp[1]; +- v[2] = temp[2]; +- } +- } +-} +- +-template +-void ComputeJtE(size_t ncam, size_t npt, const Float* pe, const Float* jc, +- const int* cmap, const int* cmlist, const Float* jp, +- const int* pmap, Float* v, bool jc_transpose, int mode, int mt1, +- int mt2) { +- if (mode != 2) { +- SetVectorZero(v, v + ncam * 8); +- ComputeJtEC(ncam, pe, jc, cmap, cmlist, v, jc_transpose, mt1); +- } +- if (mode != 1) { +- ComputeJtEP(npt, pe, jp, pmap, v + 8 * ncam, mt2); +- } +-} +- +-template +-void ComputeJtEC_(size_t ncam, const Float* ee, Float* jte, const Float* c, +- const Float* point, const Float* ms, const int* jmap, +- const int* cmap, const int* cmlist, bool intrinsic_fixed, +- int radial_distortion, int mt); +- +-DEFINE_THREAD_DATA(ComputeJtEC_) +-size_t ncam; +-const Float* ee; +-Float* jte; +-const Float *c, *point, *ms; +-const int *jmap, *cmap, *cmlist; +-bool intrinsic_fixed; +-int radial_distortion; +-BEGIN_THREAD_PROC(ComputeJtEC_) +-ComputeJtEC_(q->ncam, q->ee, q->jte, q->c, q->point, q->ms, q->jmap, q->cmap, +- q->cmlist, q->intrinsic_fixed, q->radial_distortion, 0); +-END_THREAD_RPOC(ComputeJtEC_) +- +-template +-void ComputeJtEC_(size_t ncam, const Float* ee, Float* jte, const Float* c, +- const Float* point, const Float* ms, const int* jmap, +- const int* cmap, const int* cmlist, bool intrinsic_fixed, +- int radial_distortion, int mt) { +- if (mt > 1 && ncam >= mt) { +- MYTHREAD threads[THREAD_NUM_MAX]; +- // if(ncam < mt) mt = ncam; +- const size_t thread_num = std::min(mt, THREAD_NUM_MAX); +- for (size_t i = 0; i < thread_num; ++i) { +- size_t first = ncam * i / thread_num; +- size_t last_ = ncam * (i + 1) / thread_num; +- size_t last = std::min(last_, ncam); +- RUN_THREAD(ComputeJtEC_, threads[i], (last - first), ee, jte + 8 * first, +- c + first * 16, point, ms, jmap, cmap + first, cmlist, +- intrinsic_fixed, radial_distortion); +- } +- WAIT_THREAD(threads, thread_num); +- +- } else { +- ///////////////////////////////// 
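// In the single-thread path below the camera Jacobian is recomputed on the
// fly: for every measurement of camera i, JacobianOne fills the two 8-wide
// rows jcx/jcy and AddScaledVec8 accumulates e_x * jcx + e_y * jcy into that
// camera's entry of J^T e.  The jcv buffer carries 8 floats of padding so
// ALIGN_PTR can return a 16-byte aligned pointer for the SIMD kernels.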
+- Float jcv[16 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jcx = jcv + ((16 - offset) / sizeof(Float)), * jcy = jcx + 8; +- Float *jcx = (Float *)ALIGN_PTR(jcv), *jcy = jcx + 8; +- +- for (size_t i = 0; i < ncam; ++i, ++cmap, jte += 8, c += 16) { +- int idx1 = cmap[0], idx2 = cmap[1]; +- +- for (int j = idx1; j < idx2; ++j) { +- int index = cmlist[j]; +- const Float* pt = point + jmap[2 * index + 1] * POINT_ALIGN; +- const Float* e = ee + index * 2; +- +- JacobianOne(c, pt, ms + index * 2, jcx, jcy, (Float*)NULL, (Float*)NULL, +- intrinsic_fixed, radial_distortion); +- +- ////////////////////////////// +- AddScaledVec8(e[0], jcx, jte); +- AddScaledVec8(e[1], jcy, jte); +- } +- } +- } +-} +- +-template +-void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float* ee, +- Float* jte, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, const int* cmap, +- const int* cmlist, const int* pmap, const Float* jp, +- bool intrinsic_fixed, int radial_distortion, int mode, +- int mt) { +- if (mode != 2) { +- SetVectorZero(jte, jte + ncam * 8); +- ComputeJtEC_(ncam, ee, jte, camera, point, ms, jmap, cmap, cmlist, +- intrinsic_fixed, radial_distortion, mt); +- } +- if (mode != 1) { +- ComputeJtEP(npt, ee, jp, pmap, jte + 8 * ncam, mt); +- } +-} +- +-template +-void ComputeJtE_(size_t nproj, size_t ncam, size_t npt, const Float* ee, +- Float* jte, const Float* camera, const Float* point, +- const Float* ms, const int* jmap, bool intrinsic_fixed, +- int radial_distortion, int mode) { +- SetVectorZero(jte, jte + (ncam * 8 + npt * POINT_ALIGN)); +- Float jcv[24 + 8]; // size_t offset = ((size_t) jcv) & 0xf; +- // Float* jc = jcv + (16 - offset) / sizeof(Float), *pj = jc + 16; +- Float *jc = (Float *)ALIGN_PTR(jcv), *pj = jc + 16; +- +- Float *vc0 = jte, *vp0 = jte + ncam * 8; +- +- for (size_t i = 0; i < nproj; ++i, jmap += 2, ms += 2, ee += 2) { +- int cidx = jmap[0], pidx = jmap[1]; +- const Float *c = camera + cidx * 16, *pt = point + pidx * POINT_ALIGN; +- +- if (mode == 0) { +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, pj, pj + POINT_ALIGN, intrinsic_fixed, +- radial_distortion); +- +- //////////////////////////////////////////// +- Float *vc = vc0 + cidx * 8, *vp = vp0 + pidx * POINT_ALIGN; +- AddScaledVec8(ee[0], jc, vc); +- AddScaledVec8(ee[1], jc + 8, vc); +- vp[0] += (ee[0] * pj[0] + ee[1] * pj[POINT_ALIGN]); +- vp[1] += (ee[0] * pj[1] + ee[1] * pj[POINT_ALIGN + 1]); +- vp[2] += (ee[0] * pj[2] + ee[1] * pj[POINT_ALIGN + 2]); +- } else if (mode == 1) { +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, jc, jc + 8, (Float*)NULL, (Float*)NULL, +- intrinsic_fixed, radial_distortion); +- +- //////////////////////////////////////////// +- Float* vc = vc0 + cidx * 8; +- AddScaledVec8(ee[0], jc, vc); +- AddScaledVec8(ee[1], jc + 8, vc); +- } else { +- ///////////////////////////////////////////////////// +- JacobianOne(c, pt, ms, (Float*)NULL, (Float*)NULL, pj, pj + POINT_ALIGN, +- intrinsic_fixed, radial_distortion); +- +- //////////////////////////////////////////// +- Float* vp = vp0 + pidx * POINT_ALIGN; +- vp[0] += (ee[0] * pj[0] + ee[1] * pj[POINT_ALIGN]); +- vp[1] += (ee[0] * pj[1] + ee[1] * pj[POINT_ALIGN + 1]); +- vp[2] += (ee[0] * pj[2] + ee[1] * pj[POINT_ALIGN + 2]); +- } +- } +-} +-}; +- +-using namespace ProgramCPU; +- +-template +-SparseBundleCPU::SparseBundleCPU(const int num_threads) +- : ParallelBA(PBA_INVALID_DEVICE), +- _num_camera(0), +- _num_point(0), +- 
_num_imgpt(0), +- _num_imgpt_q(0), +- _camera_data(NULL), +- _point_data(NULL), +- _imgpt_data(NULL), +- _camera_idx(NULL), +- _point_idx(NULL), +- _projection_sse(0) { +- __cpu_data_precision = sizeof(Float); +- if (num_threads <= 0) { +- __num_cpu_cores = FindProcessorCoreNum(); +- } else { +- __num_cpu_cores = num_threads; +- } +- if (__verbose_level) +- std::cout << "CPU " << (__cpu_data_precision == 4 ? "single" : "double") +- << "-precision solver; " << __num_cpu_cores << " cores" +-#ifdef CPUPBA_USE_AVX +- << " (AVX)" +-#endif +- << ".\n"; +- // the following configuration are totally based my personal experience +- // on two computers.. you should adjust them according to your system. +- // try run driver filename -profile --float to see how speed varies +- //////////////////////////////////////// +- __num_cpu_thread[FUNC_JX] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JX_] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JTE_] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JCO_JCT_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JCO_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JCT_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_JJ_JP] = __num_cpu_cores; +- __num_cpu_thread[FUNC_PJ] = __num_cpu_cores; +- __num_cpu_thread[FUNC_BCC_JCO] = __num_cpu_cores; +- __num_cpu_thread[FUNC_BCC_JCT] = __num_cpu_cores; +- __num_cpu_thread[FUNC_BCP] = __num_cpu_cores; +- +- ////this behavious is different between CPU and GPU +- __multiply_jx_usenoj = false; +- +- /////////////////////////////////////////////////////////////////////////////// +- // To get the best performance, you should ajust the number of threads +- // Linux and Windows may also have different thread launching overhead. +- +- ////////////////////////////////////////////////////////////// +- __num_cpu_thread[FUNC_JTEC_JCT] = __num_cpu_cores * 2; +- __num_cpu_thread[FUNC_JTEC_JCO] = __num_cpu_cores * 2; +- __num_cpu_thread[FUNC_JTEP] = __num_cpu_cores; +- +- /////////// +- __num_cpu_thread[FUNC_MPC] = +- 1; // single thread always faster with my experience +- +- // see the AUTO_MT_NUM marcro for definition +- __num_cpu_thread[FUNC_MPP] = 0; // automatically chosen according to size +- __num_cpu_thread[FUNC_VS] = 0; // automatically chosen according to size +- __num_cpu_thread[FUNC_VV] = 0; // automatically chosen accodring to size +-} +- +-template +-void SparseBundleCPU::SetCameraData(size_t ncam, CameraT* cams) { +- if (sizeof(CameraT) != 16 * sizeof(float)) return; // never gonna happen...? 
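// The solver assumes CameraT is laid out as exactly 16 floats (focal length,
// translation, 3x3 rotation, radial distortion, plus two trailing fields used
// for flags such as the constant-camera switch read at c[15]); a mismatching
// CameraT makes SetCameraData silently ignore the call.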
+- _num_camera = (int)ncam; +- _camera_data = cams; +- _focal_mask = NULL; +-} +- +-template +-void SparseBundleCPU::SetFocalMask(const int* fmask, float weight) { +- _focal_mask = fmask; +- _weight_q = weight; +-} +- +-template +-void SparseBundleCPU::SetPointData(size_t npoint, Point3D* pts) { +- _num_point = (int)npoint; +- _point_data = (float*)pts; +-} +- +-template +-void SparseBundleCPU::SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, +- const int* cam_idx) { +- _num_imgpt = (int)nproj; +- _imgpt_data = (float*)imgpts; +- _camera_idx = cam_idx; +- _point_idx = point_idx; +-} +- +-template +-float SparseBundleCPU::GetMeanSquaredError() { +- return float(_projection_sse / +- (_num_imgpt * __focal_scaling * __focal_scaling)); +-} +- +-template +-int SparseBundleCPU::RunBundleAdjustment() { +- ResetBundleStatistics(); +- BundleAdjustment(); +- if (__num_lm_success > 0) +- SaveBundleStatistics(_num_camera, _num_point, _num_imgpt); +- if (__num_lm_success > 0) PrintBundleStatistics(); +- ResetTemporarySetting(); +- return __num_lm_success; +-} +- +-template +-int SparseBundleCPU::ValidateInputData() { +- if (_camera_data == NULL) return STATUS_CAMERA_MISSING; +- if (_point_data == NULL) return STATUS_POINT_MISSING; +- if (_imgpt_data == NULL) return STATUS_MEASURMENT_MISSING; +- if (_camera_idx == NULL || _point_idx == NULL) +- return STATUS_PROJECTION_MISSING; +- return STATUS_SUCCESS; +-} +- +-template +-int SparseBundleCPU::InitializeBundle() { +- ///////////////////////////////////////////////////// +- TimerBA timer(this, TIMER_GPU_ALLOCATION); +- InitializeStorageForSFM(); +- InitializeStorageForCG(); +- +- if (__debug_pba) DumpCooJacobian(); +- +- return STATUS_SUCCESS; +-} +- +-template +-int SparseBundleCPU::GetParameterLength() { +- return _num_camera * 8 + POINT_ALIGN * _num_point; +-} +- +-template +-void SparseBundleCPU::BundleAdjustment() { +- if (ValidateInputData() != STATUS_SUCCESS) return; +- +- //////////////////////// +- TimerBA timer(this, TIMER_OVERALL); +- +- NormalizeData(); +- if (InitializeBundle() != STATUS_SUCCESS) { +- // failed to allocate gpu storage +- } else if (__profile_pba) { +- // profiling some stuff +- RunProfileSteps(); +- } else { +- // real optimization +- AdjustBundleAdjsutmentMode(); +- NonlinearOptimizeLM(); +- TransferDataToHost(); +- } +- DenormalizeData(); +-} +- +-template +-void SparseBundleCPU::NormalizeData() { +- TimerBA timer(this, TIMER_PREPROCESSING); +- NormalizeDataD(); +- NormalizeDataF(); +-} +- +-template +-void SparseBundleCPU::TransferDataToHost() { +- TimerBA timer(this, TIMER_GPU_DOWNLOAD); +- std::copy(_cuCameraData.begin(), _cuCameraData.end(), ((float*)_camera_data)); +-#ifdef POINT_DATA_ALIGN4 +- std::copy(_cuPointData.begin(), _cuPointData.end(), _point_data); +-#else +- for (size_t i = 0, j = 0; i < _cuPointData.size(); j++) { +- _point_data[j++] = (float)_cuPointData[i++]; +- _point_data[j++] = (float)_cuPointData[i++]; +- _point_data[j++] = (float)_cuPointData[i++]; +- } +-#endif +-} +- +-#define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \ +- { \ +- NAME.resize((num) * (channels)); \ +- total_sz += NAME.size() * sizeof(Float); \ +- } +-#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \ +- if (option) ALLOCATE_REQUIRED_DATA(NAME, num, channels) else { \ +- NAME.resize(0); \ +- } +-////////////////////////////////////////////// +-template +-bool SparseBundleCPU::InitializeStorageForSFM() { +- size_t total_sz = 0; +- ////////////////////////////////////////////////// +- 
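// Storage plan for InitializeStorageForSFM: cameras take 16 Floats each and
// points POINT_ALIGN Floats each; _cuCameraMeasurementMap and
// _cuPointMeasurementMap are CSR-style offset arrays into the measurement
// list, and _cuProjectionMap stores the (camera index, point index) pair for
// every projection.  total_sz feeds the memory report printed when
// __verbose_level > 1.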
ProcessIndexCameraQ(_cuCameraQMap, _cuCameraQList); +- total_sz += ((_cuCameraQMap.size() + _cuCameraQList.size()) * sizeof(int) / +- 1024 / 1024); +- +- /////////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuPointData, _num_point, POINT_ALIGN); // 4n +- ALLOCATE_REQUIRED_DATA(_cuCameraData, _num_camera, 16); // 16m +- ALLOCATE_REQUIRED_DATA(_cuCameraDataEX, _num_camera, 16); // 16m +- +- //////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementMap, _num_camera + 1, 1); // m +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementList, _num_imgpt, 1); // k +- ALLOCATE_REQUIRED_DATA(_cuPointMeasurementMap, _num_point + 1, 1); // n +- ALLOCATE_REQUIRED_DATA(_cuProjectionMap, _num_imgpt, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuImageProj, _num_imgpt + _num_imgpt_q, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuPointDataEX, _num_point, POINT_ALIGN); // 4n +- ALLOCATE_REQUIRED_DATA(_cuMeasurements, _num_imgpt, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuCameraQMapW, _num_imgpt_q, 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQListW, (_num_imgpt_q > 0 ? _num_camera : 0), +- 2); +- +- ALLOCATE_OPTIONAL_DATA(_cuJacobianPoint, _num_imgpt * 2, POINT_ALIGN, +- !__no_jacobian_store); // 8k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCameraT, _num_imgpt * 2, 8, +- !__no_jacobian_store && __jc_store_transpose); // 16k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCamera, _num_imgpt * 2, 8, +- !__no_jacobian_store && __jc_store_original); // 16k +- ALLOCATE_OPTIONAL_DATA(_cuCameraMeasurementListT, _num_imgpt, 1, +- __jc_store_transpose); // k +- +- ////////////////////////////////////////// +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- ////mapping from camera to measuremnts +- vector& cpi = _cuCameraMeasurementMap; +- cpi.resize(_num_camera + 1); +- vector& cpidx = _cuCameraMeasurementList; +- cpidx.resize(_num_imgpt); +- vector cpnum(_num_camera, 0); +- cpi[0] = 0; +- for (int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++; +- for (int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1]; +- /////////////////////////////////////////////////////// +- vector cptidx = cpi; +- for (int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i; +- +- /////////////////////////////////////////////////////////// +- if (_cuCameraMeasurementListT.size()) { +- vector& ridx = _cuCameraMeasurementListT; +- ridx.resize(_num_imgpt); +- for (int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i; +- } +- +- //////////////////////////////////////// +- /////constaraint weights. 
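// Focal-length constraints: _focal_mask groups cameras that must share a
// focal length (and distortion).  ProcessIndexCameraQ links each group into a
// doubly linked list and emits one (camera, next-camera) pair per link into
// _cuCameraQMap; ProcessWeightCameraQ below then weights each pair by
// sqrt(group's average projections per camera) * _weight_q, adding a
// radial-distortion weight when __use_radial_distortion is set.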
+- if (_num_imgpt_q > 0) +- ProcessWeightCameraQ(cpnum, _cuCameraQMap, _cuCameraQMapW.begin(), +- _cuCameraQListW.begin()); +- +- /////////////////////////////////////////////////////////////////////////////// +- std::copy((float*)_camera_data, ((float*)_camera_data) + _cuCameraData.size(), +- _cuCameraData.begin()); +- +-#ifdef POINT_DATA_ALIGN4 +- std::copy(_point_data, _point_data + _cuPointData.size(), +- _cuPointData.begin()); +-#else +- for (size_t i = 0, j = 0; i < _cuPointData.size(); j++) { +- _cuPointData[i++] = _point_data[j++]; +- _cuPointData[i++] = _point_data[j++]; +- _cuPointData[i++] = _point_data[j++]; +- } +-#endif +- +- //////////////////////////////////////////// +- ///////mapping from point to measurment +- vector& ppi = _cuPointMeasurementMap; +- ppi.resize(_num_point + 1); +- for (int i = 0, last_point = -1; i < _num_imgpt; ++i) { +- int pt = _point_idx[i]; +- while (last_point < pt) ppi[++last_point] = i; +- } +- ppi[_num_point] = _num_imgpt; +- +- //////////projection map +- vector& pmp = _cuProjectionMap; +- pmp.resize(_num_imgpt * 2); +- for (int i = 0; i < _num_imgpt; ++i) { +- int* imp = &pmp[i * 2]; +- imp[0] = _camera_idx[i]; +- imp[1] = _point_idx[i]; +- } +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- ////////////////////////////////////////////////////////////// +- +- __memory_usage = total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Motion/Structure/Jacobian:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- +- return true; +-} +- +-template +-bool SparseBundleCPU::ProcessIndexCameraQ(vector& qmap, +- vector& qlist) { +- /////////////////////////////////// +- qlist.resize(0); +- qmap.resize(0); +- _num_imgpt_q = 0; +- +- if (_camera_idx == NULL) return true; +- if (_point_idx == NULL) return true; +- if (_focal_mask == NULL) return true; +- if (_num_camera == 0) return true; +- if (_weight_q <= 0) return true; +- +- /////////////////////////////////////// +- +- int error = 0; +- vector temp(_num_camera * 2, -1); +- +- for (int i = 0; i < _num_camera; ++i) { +- int iq = _focal_mask[i]; +- if (iq > i) { +- error = 1; +- break; +- } +- if (iq < 0) continue; +- if (iq == i) continue; +- int ip = temp[2 * iq]; +- // float ratio = _camera_data[i].f / _camera_data[iq].f; +- // if(ratio < 0.01 || ratio > 100) +- //{ +- // std::cout << "Warning: constaraints on largely different camreas\n"; +- // continue; +- //}else +- if (_focal_mask[iq] != iq) { +- error = 1; +- break; +- } else if (ip == -1) { +- temp[2 * iq] = i; +- temp[2 * iq + 1] = i; +- temp[2 * i] = iq; +- temp[2 * i + 1] = iq; +- } else { +- // maintain double-linked list +- temp[2 * i] = ip; +- temp[2 * i + 1] = iq; +- temp[2 * ip + 1] = i; +- temp[2 * iq] = i; +- } +- } +- +- if (error) { +- std::cout << "Error: incorrect constraints\n"; +- _focal_mask = NULL; +- return false; +- } +- +- //////////////////////////////////////// +- qlist.resize(_num_camera * 2, -1); +- for (int i = 0; i < _num_camera; ++i) { +- int inext = temp[2 * i + 1]; +- if (inext == -1) continue; +- qlist[2 * i] = _num_imgpt_q; +- qlist[2 * inext + 1] = _num_imgpt_q; +- qmap.push_back(i); +- qmap.push_back(inext); +- _num_imgpt_q++; +- } +- return true; +-} +- +-template +-void SparseBundleCPU::ProcessWeightCameraQ(vector& cpnum, +- vector& qmap, +- Float* qmapw, Float* qlistw) { +- // set average focal length and average radial distortion +- vector qpnum(_num_camera, 0), qcnum(_num_camera, 0); +- vector fs(_num_camera, 0), rs(_num_camera, 0); +- +- for (int i = 0; i < _num_camera; ++i) { +- int 
qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- fs[qi] += _camera_data[i].f; +- rs[qi] += _camera_data[i].radial; +- qpnum[qi] += cpnum[i]; +- qcnum[qi] += 1.0f; +- } +- +- // this seems not really matter..they will converge anyway +- for (int i = 0; i < _num_camera; ++i) { +- int qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- _camera_data[i].f = fs[qi] / qcnum[qi]; +- _camera_data[i].radial = rs[qi] / qcnum[qi]; +- } /**/ +- +- ///////////////////////////////////////// +- std::fill(qlistw, qlistw + _num_camera * 2, 0); +- +- for (int i = 0; i < _num_imgpt_q; ++i) { +- int cidx = qmap[i * 2], qi = _focal_mask[cidx]; +- Float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q; +- Float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0); +- qmapw[i * 2] = wi; +- qmapw[i * 2 + 1] = wr; +- qlistw[cidx * 2] = wi; +- qlistw[cidx * 2 + 1] = wr; +- } +-} +- +-///////////////////////////////////////////////// +-template +-bool SparseBundleCPU::InitializeStorageForCG() { +- size_t total_sz = 0; +- int plen = GetParameterLength(); // q = 8m + 3n +- +- //////////////////////////////////////////// 6q +- ALLOCATE_REQUIRED_DATA(_cuVectorJtE, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorXK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorJJ, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorZK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorPK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorRK, plen, 1); +- +- /////////////////////////////////////////// +- unsigned int cblock_len = (__use_radial_distortion ? 64 : 56); +- ALLOCATE_REQUIRED_DATA(_cuBlockPC, _num_camera * cblock_len + 6 * _num_point, +- 1); // 64m + 12n +- ALLOCATE_REQUIRED_DATA(_cuVectorJX, _num_imgpt + _num_imgpt_q, 2); // 2k +- ALLOCATE_OPTIONAL_DATA(_cuVectorSJ, plen, 1, __jacobian_normalize); +- +- ///////////////////////////////////////// +- __memory_usage += total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Conjugate Gradient Solver:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- return true; +-} +- +-/////////////////////////////////////////////////// +-template +-void SparseBundleCPU::PrepareJacobianNormalization() { +- if (!_cuVectorSJ.size()) return; +- +- if ((__jc_store_transpose || __jc_store_original) && +- _cuJacobianPoint.size() && !__bundle_current_mode) { +- VectorF null; +- null.swap(_cuVectorSJ); +- EvaluateJacobians(); +- null.swap(_cuVectorSJ); +- ComputeDiagonal(_cuVectorSJ); +- ComputeSQRT(_cuVectorSJ); +- } else { +- VectorF null; +- null.swap(_cuVectorSJ); +- EvaluateJacobians(); +- ComputeBlockPC(0, true); +- null.swap(_cuVectorSJ); +- _cuVectorJJ.swap(_cuVectorSJ); +- ComputeRSQRT(_cuVectorSJ); +- } +-} +- +-template +-void SparseBundleCPU::EvaluateJacobians() { +- if (__no_jacobian_store) return; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original && +- !__jc_store_transpose) +- return; +- +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JJ, true); +- +- if (__jc_store_original || !__jc_store_transpose) { +- int fid = __jc_store_original +- ? (__jc_store_transpose ? 
FUNC_JJ_JCO_JCT_JP : FUNC_JJ_JCO_JP) +- : FUNC_JJ_JP; +- ComputeJacobian( +- _num_imgpt, _num_camera, _cuCameraData.begin(), _cuPointData.begin(), +- _cuJacobianCamera.begin(), _cuJacobianPoint.begin(), +- &_cuProjectionMap.front(), _cuVectorSJ.begin(), _cuMeasurements.begin(), +- __jc_store_transpose ? &_cuCameraMeasurementListT.front() : NULL, +- __fixed_intrinsics, __use_radial_distortion, false, +- _cuJacobianCameraT.begin(), __num_cpu_thread[fid]); +- } else { +- ComputeJacobian(_num_imgpt, _num_camera, _cuCameraData.begin(), +- _cuPointData.begin(), _cuJacobianCameraT.begin(), +- _cuJacobianPoint.begin(), &_cuProjectionMap.front(), +- _cuVectorSJ.begin(), _cuMeasurements.begin(), +- &_cuCameraMeasurementListT.front(), __fixed_intrinsics, +- __use_radial_distortion, true, ((Float*)0), +- __num_cpu_thread[FUNC_JJ_JCT_JP]); +- } +- ++__num_jacobian_eval; +-} +- +-template +-void SparseBundleCPU::ComputeJtE(VectorF& E, VectorF& JtE, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JTE, true); +- if (mode == 0) mode = __bundle_current_mode; +- +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) { +- if (_cuJacobianPoint.size()) { +- ProgramCPU::ComputeJtE_( +- _num_imgpt, _num_camera, _num_point, E.begin(), JtE.begin(), +- _cuCameraData.begin(), _cuPointData.begin(), _cuMeasurements.begin(), +- &_cuProjectionMap.front(), &_cuCameraMeasurementMap.front(), +- &_cuCameraMeasurementList.front(), &_cuPointMeasurementMap.front(), +- _cuJacobianPoint.begin(), __fixed_intrinsics, __use_radial_distortion, +- mode, __num_cpu_thread[FUNC_JTE_]); +- +- if (_cuVectorSJ.size() && mode != 2) +- ProgramCPU::ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8); +- } else { +- ProgramCPU::ComputeJtE_(_num_imgpt, _num_camera, _num_point, E.begin(), +- JtE.begin(), _cuCameraData.begin(), +- _cuPointData.begin(), _cuMeasurements.begin(), +- &_cuProjectionMap.front(), __fixed_intrinsics, +- __use_radial_distortion, mode); +- +- ////////////////////////////////////////////////////////// +- // if(_cuVectorSJ.size()) ProgramCPU::ComputeVXY(JtE, _cuVectorSJ, JtE); +- if (!_cuVectorSJ.size()) { +- } else if (mode == 2) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_point * POINT_ALIGN, +- _num_camera * 8); +- else if (mode == 1) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8); +- else +- ComputeVXY(JtE, _cuVectorSJ, JtE); +- } +- } else if (__jc_store_transpose) { +- ProgramCPU::ComputeJtE( +- _num_camera, _num_point, E.begin(), _cuJacobianCameraT.begin(), +- &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.begin(), +- true, mode, __num_cpu_thread[FUNC_JTEC_JCT], +- __num_cpu_thread[FUNC_JTEP]); +- } else { +- ProgramCPU::ComputeJtE( +- _num_camera, _num_point, E.begin(), _cuJacobianCamera.begin(), +- &_cuCameraMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuJacobianPoint.begin(), &_cuPointMeasurementMap.front(), JtE.begin(), +- false, mode, __num_cpu_thread[FUNC_JTEC_JCO], +- __num_cpu_thread[FUNC_JTEP]); +- } +- +- if (mode != 2 && _num_imgpt_q > 0) { +- ProgramCPU::ComputeJQtEC(_num_camera, E.begin() + 2 * _num_imgpt, +- &_cuCameraQList.front(), _cuCameraQListW.begin(), +- _cuVectorSJ.begin(), JtE.begin()); +- } +-} +- +-template +-void SparseBundleCPU::SaveBundleRecord(int iter, float res, +- float damping, float& g_norm, +- float& g_inf) { +- // do not really compute if parameter not specified... +- // for large dataset, it never converges.. 
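// A reading of the two statistics computed below, going by how the helpers
// are used elsewhere in this file: g_inf is the largest-magnitude entry of
// J^T e (an infinity-norm style value, computed only when __lm_check_gradient
// is set so it can be tested against __lm_gradient_threshold), and g_norm is
// the squared Euclidean norm of the same gradient vector, computed only when
// __save_gradient_norm asks for it; otherwise g_norm simply reuses g_inf.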
+- g_inf = __lm_check_gradient ? float(ComputeVectorMax(_cuVectorJtE)) : 0; +- g_norm = +- __save_gradient_norm ? float(ComputeVectorNorm(_cuVectorJtE)) : g_inf; +- ConfigBA::SaveBundleRecord(iter, res, damping, g_norm, g_inf); +-} +- +-template +-float SparseBundleCPU::EvaluateProjection(VectorF& cam, VectorF& point, +- VectorF& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjection(_num_imgpt, cam.begin(), point.begin(), +- _cuMeasurements.begin(), &_cuProjectionMap.front(), +- proj.begin(), __use_radial_distortion, +- __num_cpu_thread[FUNC_PJ]); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(_num_imgpt_q, cam.begin(), &_cuCameraQMap.front(), +- _cuCameraQMapW.begin(), proj.begin() + 2 * _num_imgpt); +- return (float)ComputeVectorNorm(proj, __num_cpu_thread[FUNC_VS]); +-} +- +-template +-float SparseBundleCPU::EvaluateProjectionX(VectorF& cam, VectorF& point, +- VectorF& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjectionX(_num_imgpt, cam.begin(), point.begin(), +- _cuMeasurements.begin(), &_cuProjectionMap.front(), +- proj.begin(), __use_radial_distortion, +- __num_cpu_thread[FUNC_PJ]); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(_num_imgpt_q, cam.begin(), &_cuCameraQMap.front(), +- _cuCameraQMapW.begin(), proj.begin() + 2 * _num_imgpt); +- return (float)ComputeVectorNorm(proj, __num_cpu_thread[FUNC_VS]); +-} +- +-template +-void SparseBundleCPU::ComputeJX(VectorF& X, VectorF& JX, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JX, true); +- if (__no_jacobian_store || (__multiply_jx_usenoj && mode != 2) || +- !__jc_store_original) { +- ProgramCPU::ComputeJX_( +- _num_imgpt, _num_camera, X.begin(), JX.begin(), _cuCameraData.begin(), +- _cuPointData.begin(), _cuMeasurements.begin(), _cuVectorSJ.begin(), +- &_cuProjectionMap.front(), __fixed_intrinsics, __use_radial_distortion, +- mode, __num_cpu_thread[FUNC_JX_]); +- } else { +- ProgramCPU::ComputeJX(_num_imgpt, _num_camera, X.begin(), +- _cuJacobianCamera.begin(), _cuJacobianPoint.begin(), +- &_cuProjectionMap.front(), JX.begin(), mode, +- __num_cpu_thread[FUNC_JX]); +- } +- +- if (_num_imgpt_q > 0 && mode != 2) { +- ProgramCPU::ComputeJQX(_num_imgpt_q, X.begin(), &_cuCameraQMap.front(), +- _cuCameraQMapW.begin(), _cuVectorSJ.begin(), +- JX.begin() + 2 * _num_imgpt); +- } +-} +- +-template +-void SparseBundleCPU::ComputeBlockPC(float lambda, bool dampd) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_BC, true); +- +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose && +- __bundle_current_mode != 2)) { +- ComputeDiagonalBlock_( +- lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuProjectionMap, _cuVectorSJ, _cuCameraQListW, _cuVectorJJ, _cuBlockPC, +- __fixed_intrinsics, __use_radial_distortion, __bundle_current_mode); +- } else if (__jc_store_transpose) { +- ComputeDiagonalBlock( +- _num_camera, _num_point, lambda, dampd, _cuJacobianCameraT.begin(), +- &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(), +- &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(), +- _cuBlockPC.begin(), __use_radial_distortion, true, +- __num_cpu_thread[FUNC_BCC_JCT], __num_cpu_thread[FUNC_BCP], +- __bundle_current_mode); +- } else { +- ComputeDiagonalBlock( +- _num_camera, _num_point, lambda, dampd, _cuJacobianCamera.begin(), +- &_cuCameraMeasurementMap.front(), _cuJacobianPoint.begin(), 
+- &_cuPointMeasurementMap.front(), &_cuCameraMeasurementList.front(), +- _cuVectorSJ.begin(), _cuCameraQListW.begin(), _cuVectorJJ.begin(), +- _cuBlockPC.begin(), __use_radial_distortion, false, +- __num_cpu_thread[FUNC_BCC_JCO], __num_cpu_thread[FUNC_BCP], +- __bundle_current_mode); +- } +-} +- +-template +-void SparseBundleCPU::ApplyBlockPC(VectorF& v, VectorF& pv, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_MP, true); +- MultiplyBlockConditioner(_num_camera, _num_point, _cuBlockPC.begin(), +- v.begin(), pv.begin(), __use_radial_distortion, mode, +- __num_cpu_thread[FUNC_MPC], +- __num_cpu_thread[FUNC_MPP]); +-} +- +-template +-void SparseBundleCPU::ComputeDiagonal(VectorF& JJ) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_DD, true); +- if (__no_jacobian_store) { +- } else if (__jc_store_transpose) { +- ProgramCPU::ComputeDiagonal( +- _cuJacobianCameraT, _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuCameraQListW.begin(), JJ, true, __use_radial_distortion); +- } else if (__jc_store_original) { +- ProgramCPU::ComputeDiagonal( +- _cuJacobianCamera, _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuCameraQListW.begin(), JJ, false, __use_radial_distortion); +- } +-} +- +-template +-void SparseBundleCPU::NormalizeDataF() { +- int incompatible_radial_distortion = 0; +- _cuMeasurements.resize(_num_imgpt * 2); +- if (__focal_normalize) { +- if (__focal_scaling == 1.0f) { +- //------------------------------------------------------------------ +- ////////////////////////////////////////////////////////////// +- vector focals(_num_camera); +- for (int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f; +- std::nth_element(focals.begin(), focals.begin() + _num_camera / 2, +- focals.end()); +- float median_focal_length = focals[_num_camera / 2]; +- __focal_scaling = __data_normalize_median / median_focal_length; +- Float radial_factor = median_focal_length * median_focal_length * 4.0f; +- +- /////////////////////////////// +- +- for (int i = 0; i < _num_imgpt * 2; ++i) { +- _cuMeasurements[i] = Float(_imgpt_data[i] * __focal_scaling); +- } +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f *= __focal_scaling; +- if (!__use_radial_distortion) { +- } else if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- incompatible_radial_distortion++; +- _camera_data[i].radial = 0; +- } else if (__use_radial_distortion == -1) { +- _camera_data[i].radial *= radial_factor; +- } +- } +- if (__verbose_level > 2) +- std::cout << "Focal length normalized by " << __focal_scaling << '\n'; +- __reset_initial_distortion = false; +- } +- } else { +- if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) { +- if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- _camera_data[i].radial = 0; +- incompatible_radial_distortion++; +- } +- } +- __reset_initial_distortion = false; +- } +- std::copy(_imgpt_data, _imgpt_data + _cuMeasurements.size(), +- _cuMeasurements.begin()); +- } +- +- if (incompatible_radial_distortion) { +- std::cout << "ERROR: incompatible radial distortion input; reset to 0;\n"; +- } +-} +- +-template +-void SparseBundleCPU::NormalizeDataD() { +- if (__depth_scaling == 1.0f) { +- const float dist_bound = 1.0f; +- vector oz(_num_imgpt); +- vector cpdist1(_num_camera, 
dist_bound); +- vector cpdist2(_num_camera, -dist_bound); +- vector camnpj(_num_camera, 0), cambpj(_num_camera, 0); +- int bad_point_count = 0; +- for (int i = 0; i < _num_imgpt; ++i) { +- int cmidx = _camera_idx[i]; +- CameraT* cam = _camera_data + cmidx; +- float* rz = cam->m[2]; +- float* x = _point_data + 4 * _point_idx[i]; +- oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->t[2]); +- +- ///////////////////////////////////////////////// +- // points behind camera may causes big problem +- float ozr = oz[i] / cam->t[2]; +- if (fabs(ozr) < __depth_check_epsilon) { +- bad_point_count++; +- float px = cam->f * (cam->m[0][0] * x[0] + cam->m[0][1] * x[1] + +- cam->m[0][2] * x[2] + cam->t[0]); +- float py = cam->f * (cam->m[1][0] * x[0] + cam->m[1][1] * x[1] + +- cam->m[1][2] * x[2] + cam->t[1]); +- float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1]; +- bool checkx = fabs(mx) > fabs(my); +- if ((checkx && px * oz[i] * mx < 0 && fabs(mx) > 64) || +- (!checkx && py * oz[i] * my < 0 && fabs(my) > 64)) { +- if (__verbose_level > 3) +- std::cout << "Warning: proj of #" << cmidx +- << " on the wrong side, oz = " << oz[i] << " (" +- << (px / oz[i]) << ',' << (py / oz[i]) << ") (" << mx +- << ',' << my << ")\n"; +- ///////////////////////////////////////////////////////////////////////// +- if (oz[i] > 0) +- cpdist2[cmidx] = 0; +- else +- cpdist1[cmidx] = 0; +- } +- if (oz[i] >= 0) +- cpdist1[cmidx] = std::min(cpdist1[cmidx], oz[i]); +- else +- cpdist2[cmidx] = std::max(cpdist2[cmidx], oz[i]); +- } +- if (oz[i] < 0) { +- __num_point_behind++; +- cambpj[cmidx]++; +- } +- camnpj[cmidx]++; +- } +- if (bad_point_count > 0 && __depth_degeneracy_fix) { +- if (!__focal_normalize || !__depth_normalize) +- std::cout << "Enable data normalization on degeneracy\n"; +- __focal_normalize = true; +- __depth_normalize = true; +- } +- if (__depth_normalize) { +- std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end()); +- float oz_median = oz[_num_imgpt / 2]; +- float shift_min = std::min(oz_median * 0.001f, 1.0f); +- float dist_threshold = shift_min * 0.1f; +- __depth_scaling = (1.0 / oz_median) / __data_normalize_median; +- if (__verbose_level > 2) +- std::cout << "Depth normalized by " << __depth_scaling << " (" +- << oz_median << ")\n"; +- +- for (int i = 0; i < _num_camera; ++i) { +- // move the camera a little bit? +- if (!__depth_degeneracy_fix) { +- } else if ((cpdist1[i] < dist_threshold || +- cpdist2[i] > -dist_threshold)) { +- float shift_epsilon = fabs(_camera_data[i].t[2] * FLT_EPSILON); +- float shift = std::max(shift_min, shift_epsilon); +- bool boths = +- cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold; +- _camera_data[i].t[2] += shift; +- if (__verbose_level > 3) +- std::cout << "Adjust C" << std::setw(5) << i << " by " +- << std::setw(12) << shift << " [B" << std::setw(2) +- << cambpj[i] << "/" << std::setw(5) << camnpj[i] << "] [" +- << (boths ? 
'X' : ' ') << "][" << cpdist1[i] << ", " +- << cpdist2[i] << "]\n"; +- __num_camera_modified++; +- } +- _camera_data[i].t[0] *= __depth_scaling; +- _camera_data[i].t[1] *= __depth_scaling; +- _camera_data[i].t[2] *= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- ///////////////////////////////// +- _point_data[4 * i + 0] *= __depth_scaling; +- _point_data[4 * i + 1] *= __depth_scaling; +- _point_data[4 * i + 2] *= __depth_scaling; +- } +- } +- if (__num_point_behind > 0) +- std::cout << "WARNING: " << __num_point_behind +- << " points are behind cameras.\n"; +- if (__num_camera_modified > 0) +- std::cout << "WARNING: " << __num_camera_modified +- << " camera moved to avoid degeneracy.\n"; +- } +-} +- +-template +-void SparseBundleCPU::DenormalizeData() { +- if (__focal_normalize && __focal_scaling != 1.0f) { +- float squared_focal_factor = (__focal_scaling * __focal_scaling); +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f /= __focal_scaling; +- if (__use_radial_distortion == -1) +- _camera_data[i].radial *= squared_focal_factor; +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- _projection_sse /= squared_focal_factor; +- __focal_scaling = 1.0f; +- } else if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- +- if (__depth_normalize && __depth_scaling != 1.0f) { +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].t[0] /= __depth_scaling; +- _camera_data[i].t[1] /= __depth_scaling; +- _camera_data[i].t[2] /= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- _point_data[4 * i + 0] /= __depth_scaling; +- _point_data[4 * i + 1] /= __depth_scaling; +- _point_data[4 * i + 2] /= __depth_scaling; +- } +- __depth_scaling = 1.0f; +- } +-} +- +-template +-int SparseBundleCPU::SolveNormalEquationPCGX(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- VectorF null; +- VectorF& VectorDP = __lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- //////////////////////////////////////////////// +- +- /////////////////////////////////////////////////////// +- // B = [BC 0 ; 0 BP] +- // m = [mc 0; 0 mp]; +- // A x= BC * x - JcT * Jp * mp * JpT * Jc * x +- // = JcT * Jc x + lambda * D * x + ........ 
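// Put differently, in common bundle-adjustment notation (the symbols here are
// explanatory and do not appear in this file): with J = [Jc Jp], the damped
// normal equations
//     (J^T J + lambda * D) [dx_c; dx_p] = J^T e
// have the 2x2 block structure
//     [ U   W ] [dx_c]   [g_c]      U = Jc^T Jc + lambda*Dc   (the BC block)
//     [ W^T V ] [dx_p] = [g_p]      V = Jp^T Jp + lambda*Dp   (the BP block)
// and this function runs CG on the reduced camera system
//     (U - W V^{-1} W^T) dx_c = g_c - W V^{-1} g_p
// using the point part of the block preconditioner as V^{-1} (the "mp"
// above); dx_p is then recovered by back-substitution at the end of the
// function.  W and W^T are never formed explicitly: they are applied through
// ComputeJX / ComputeJtE restricted to the camera or point part via the mode
// argument.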
+- //////////////////////////////////////////////////////////// +- +- VectorF r; +- r.set(_cuVectorRK.data(), 8 * _num_camera); +- VectorF p; +- p.set(_cuVectorPK.data(), 8 * _num_camera); +- VectorF z; +- z.set(_cuVectorZK.data(), 8 * _num_camera); +- VectorF x; +- x.set(_cuVectorXK.data(), 8 * _num_camera); +- VectorF d; +- d.set(VectorDP.data(), 8 * _num_camera); +- +- VectorF& u = _cuVectorRK; +- VectorF& v = _cuVectorPK; +- VectorF up; +- up.set(u.data() + 8 * _num_camera, 3 * _num_point); +- VectorF vp; +- vp.set(v.data() + 8 * _num_camera, 3 * _num_point); +- VectorF uc; +- uc.set(z.data(), 8 * _num_camera); +- +- VectorF& e = _cuVectorJX; +- VectorF& e2 = _cuImageProj; +- +- ApplyBlockPC(_cuVectorJtE, u, 2); +- ComputeJX(u, e, 2); +- ComputeJtE(e, uc, 1); +- ComputeSAXPY(Float(-1.0f), uc, _cuVectorJtE, r); // r +- ApplyBlockPC(r, p, 1); // z = p = M r +- +- float_t rtz0 = (float_t)ComputeVectorDot(r, p); // r(0)' * z(0) +- ComputeJX(p, e, 1); // Jc * x +- ComputeJtE(e, u, 2); // JpT * jc * x +- ApplyBlockPC(u, v, 2); +- float_t qtq0 = +- (float_t)ComputeVectorNorm(e, __num_cpu_thread[FUNC_VS]); // q(0)' * q(0) +- float_t pdp0 = (float_t)ComputeVectorNormW(p, d); // p(0)' * DDD * p(0) +- float_t uv0 = (float_t)ComputeVectorDot(up, vp); +- float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!std::isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- ComputeSAX((Float)alpha0, p, x); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY(Float(-1.0f), e2, e, e, __num_cpu_thread[FUNC_VV]); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ((Float)lambda, d, p, uc, uc); +- ComputeSAXPY((Float)-alpha0, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- +- ////////////////////////////////////////////////////////////////////////// +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(r, z, 1); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- r, z); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? 
'0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- //////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((Float)betak, p, z, p); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(p, e, 1); // Jc * p +- ComputeJtE(e, u, 2); // JpT * jc * p +- ApplyBlockPC(u, v, 2); +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = +- (float_t)ComputeVectorNorm(e, __num_cpu_thread[FUNC_VS]); // q(k)' q(k) +- float_t pdpk = (float_t)ComputeVectorNormW(p, d); // p(k)' * DDD * p(k) +- float_t uvk = (float_t)ComputeVectorDot(up, vp); +- float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. +- +- //////////////////////////////////////////////// +- ComputeSAXPY((Float)alphak, p, x, x); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY((Float)-1.0f, e2, e, e, __num_cpu_thread[FUNC_VV]); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ((Float)lambda, d, p, uc, uc); +- ComputeSAXPY((Float)-alphak, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- } +- +- ComputeJX(x, e, 1); +- ComputeJtE(e, u, 2); +- VectorF jte_p; +- jte_p.set(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 3); +- ComputeSAXPY((Float)-1.0f, up, jte_p, vp); +- ApplyBlockPC(v, _cuVectorXK, 2); +- return iteration; +-} +- +-template +-int SparseBundleCPU::SolveNormalEquationPCGB(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- VectorF null; +- VectorF& VectorDP = __lm_use_diagonal_damp ? 
_cuVectorJJ : null; // diagonal +- VectorF& VectorQK = _cuVectorZK; // temporary +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- //////////////////////////////////////////////////////// +- ApplyBlockPC(_cuVectorJtE, +- _cuVectorPK); // z(0) = p(0) = M * r(0)//r(0) = Jt * e +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(0) = J * p(0) +- +- ////////////////////////////////////////////////// +- float_t rtz0 = +- (float_t)ComputeVectorDot(_cuVectorJtE, _cuVectorPK); // r(0)' * z(0) +- float_t qtq0 = (float_t)ComputeVectorNorm( +- _cuVectorJX, __num_cpu_thread[FUNC_VS]); // q(0)' * q(0) +- float_t ptdp0 = +- (float_t)ComputeVectorNormW(_cuVectorPK, VectorDP); // p(0)' * DDD * p(0) +- float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!std::isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- +- ComputeSAX((Float)alpha0, _cuVectorPK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJtE(_cuVectorJX, VectorQK); // Jt * (J * p0) +- +- ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK, +- VectorQK); // Jt * J * p0 + lambda * DDD * p0 +- +- ComputeSAXPY( +- (Float)-alpha0, VectorQK, _cuVectorJtE, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(_cuVectorRK, _cuVectorZK); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- _cuVectorRK, _cuVectorZK); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? '0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- ////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((Float)betak, _cuVectorPK, _cuVectorZK, +- _cuVectorPK); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(k) = J * p(k) +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = (float_t)ComputeVectorNorm( +- _cuVectorJX, __num_cpu_thread[FUNC_VS]); // q(k)' q(k) +- float_t ptdpk = (float_t)ComputeVectorNormW( +- _cuVectorPK, VectorDP); // p(k)' * DDD * p(k) +- +- float_t alphak = rtzk / (qtqk + lambda * ptdpk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!std::isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. 
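// For reference, this CG loop follows the textbook preconditioned-CG
// recurrence for A x = b, with A = J^T J + lambda * D applied implicitly
// (ComputeJX, then ComputeJtE, plus the ComputeSXYPZ damping term) and
// M^{-1} being the block preconditioner applied by ApplyBlockPC:
//     z_k     = M^{-1} r_k
//     beta_k  = (r_k' z_k) / (r_{k-1}' z_{k-1})
//     p_k     = z_k + beta_k * p_{k-1}
//     alpha_k = (r_k' z_k) / (p_k' A p_k)
//     x_{k+1} = x_k + alpha_k * p_k
//     r_{k+1} = r_k - alpha_k * A p_k
// where p_k' A p_k is evaluated as ||J p_k||^2 + lambda * p_k' D p_k, i.e.
// the qtqk and ptdpk terms above.  Every __cg_recalculate_freq iterations
// (see below) the residual is rebuilt from scratch as r = J^T e - A x rather
// than updated recursively, presumably to limit accumulated round-off.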
+- +- //////////////////////////////////////////////// +- ComputeSAXPY((Float)alphak, _cuVectorPK, _cuVectorXK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) { +- ////r = JtE - (Jt J + lambda * D) x +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK); +- ComputeSAXPY((Float)-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK); +- } else { +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ((Float)lambda, VectorDP, _cuVectorPK, VectorQK, +- VectorQK); // +- ComputeSAXPY( +- (Float)-alphak, VectorQK, _cuVectorRK, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- } +- } +- return iteration; +-} +- +-template +-int SparseBundleCPU::SolveNormalEquation(float lambda) { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1); +- return 1; +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2); +- return 1; +- } else { +- ////solve linear system using Conjugate Gradients +- return __cg_schur_complement ? SolveNormalEquationPCGX(lambda) +- : SolveNormalEquationPCGB(lambda); +- } +-} +- +-template +-void SparseBundleCPU::DumpCooJacobian() { +- ////////// +- ofstream jo("j.txt"); +- int cn = __use_radial_distortion ? 8 : 7; +- int width = cn * _num_camera + 3 * _num_point; +- jo << "%%MatrixMarket matrix coordinate real general\n"; +- jo << (_num_imgpt * 2) << " " << width << " " << (cn + 3) * _num_imgpt * 2 +- << '\n'; +- for (int i = 0; i < _num_imgpt; ++i) { +- int ci = _camera_idx[i]; +- int pi = _point_idx[i]; +- int row = i * 2 + 1; +- // Float * jc = _cuJacobianCamera.data() + i * 16; +- // Float * jp = _cuJacobianPoint.data() + i * 6; +- int idx1 = ci * cn; +- int idx2 = _num_camera * cn + 3 * pi; +- +- for (int k = 0; k < 2; ++k, ++row) { +- for (int j = 0; j < cn; ++j) { +- jo << row << " " << (idx1 + j + 1) << " 1\n"; +- } +- for (int j = 0; j < 3; ++j) { +- jo << row << " " << (idx2 + j + 1) << " 1\n"; +- } +- } +- } +- +- ofstream jt("jt.txt"); +- jt << "%%MatrixMarket matrix coordinate real general\n"; +- jt << width << " " << (_num_imgpt * 2) << " " << (cn + 3) * _num_imgpt * 2 +- << '\n'; +- +- int* lisc = &_cuCameraMeasurementList[0]; +- int* mapc = &_cuCameraMeasurementMap[0]; +- int* mapp = &_cuPointMeasurementMap[0]; +- +- for (int i = 0; i < _num_camera; ++i) { +- int c0 = mapc[i]; +- int c1 = mapc[i + 1]; +- for (int k = 0; k < cn; ++k) { +- int row = i * cn + k + 1; +- for (int j = c0; j < c1; ++j) +- jt << row << " " << (lisc[j] * 2 + 1) << " 1\n" << row << " " +- << (2 * lisc[j] + 2) << " 1\n"; +- ; +- } +- } +- for (int i = 0; i < _num_point; ++i) { +- int p0 = mapp[i]; +- int p1 = mapp[i + 1]; +- for (int k = 0; k < 3; ++k) { +- int row = i * 3 + _num_camera * cn + k + 1; +- for (int j = p0; j < p1; ++j) +- jt << row << " " << (2 * j + 1) << " 1\n" << row << " " << (2 * j + 2) +- << " 1\n"; +- ; +- } +- } +-} +- +-template +-void SparseBundleCPU::RunTestIterationLM(bool reduced) { +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- if (reduced) 
+- SolveNormalEquationPCGX(__lm_initial_damp); +- else +- SolveNormalEquationPCGB(__lm_initial_damp); +- UpdateCameraPoint(_cuVectorZK, _cuImageProj); +- ComputeVectorDot(_cuVectorXK, _cuVectorJtE); +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeVectorNorm(_cuVectorJX, __num_cpu_thread[FUNC_VS]); +-} +- +-template +-float SparseBundleCPU::UpdateCameraPoint(VectorF& dx, +- VectorF& cuImageTempProj) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_UP, true); +- +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera); +- ProgramCPU::UpdateCameraPoint( +- _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX, +- _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]); +- return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, _num_point * POINT_ALIGN, +- _num_camera * 8); +- ProgramCPU::UpdateCameraPoint( +- _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX, +- _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]); +- return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj); +- } else { +- if (__jacobian_normalize) ComputeVXY(_cuVectorXK, _cuVectorSJ, dx); +- ProgramCPU::UpdateCameraPoint( +- _num_camera, _cuCameraData, _cuPointData, dx, _cuCameraDataEX, +- _cuPointDataEX, __bundle_current_mode, __num_cpu_thread[FUNC_VV]); +- return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj); +- } +-} +- +-template +-float SparseBundleCPU::SaveUpdatedSystem(float residual_reduction, +- float dx_sqnorm, +- float damping) { +- float expected_reduction; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- VectorF xk; +- xk.set(_cuVectorXK.data(), 8 * _num_camera); +- VectorF jte; +- jte.set(_cuVectorJtE.data(), 8 * _num_camera); +- float dxtg = (float)ComputeVectorDot(xk, jte); +- if (__lm_use_diagonal_damp) { +- VectorF jj; +- jj.set(_cuVectorJJ.data(), 8 * _num_camera); +- float dq = (float)ComputeVectorNormW(xk, jj); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuCameraData.swap(_cuCameraDataEX); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- VectorF xk; +- xk.set(_cuVectorXK.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- VectorF jte; +- jte.set(_cuVectorJtE.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- float dxtg = (float)ComputeVectorDot(xk, jte); +- if (__lm_use_diagonal_damp) { +- VectorF jj; +- jj.set(_cuVectorJJ.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- float dq = (float)ComputeVectorNormW(xk, jj); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuPointData.swap(_cuPointDataEX); +- } else { +- float dxtg = (float)ComputeVectorDot(_cuVectorXK, _cuVectorJtE); +- if (__accurate_gain_ratio) { +- ComputeJX(_cuVectorXK, _cuVectorJX); +- float njx = +- (float)ComputeVectorNorm(_cuVectorJX, __num_cpu_thread[FUNC_VS]); +- expected_reduction = 2.0f * dxtg - njx; +- +- // could the expected reduction be negative??? 
not sure +- if (expected_reduction <= 0) +- expected_reduction = 0.001f * residual_reduction; +- } else if (__lm_use_diagonal_damp) { +- float dq = (float)ComputeVectorNormW(_cuVectorXK, _cuVectorJJ); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- /// save the new motion/struture +- _cuCameraData.swap(_cuCameraDataEX); +- _cuPointData.swap(_cuPointDataEX); +- } +- //////////////////////////////////////////// +- return float(residual_reduction / expected_reduction); +-} +- +-template +-void SparseBundleCPU::AdjustBundleAdjsutmentMode() { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- _cuJacobianPoint.resize(0); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- _cuJacobianCamera.resize(0); +- _cuJacobianCameraT.resize(0); +- } +-} +- +-template +-float SparseBundleCPU::EvaluateDeltaNorm() { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- VectorF temp; +- temp.set(_cuVectorXK.data(), 8 * _num_camera); +- return (float)ComputeVectorNorm(temp); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- VectorF temp; +- temp.set(_cuVectorXK.data() + 8 * _num_camera, POINT_ALIGN * _num_point); +- return (float)ComputeVectorNorm(temp); +- } else { +- return (float)ComputeVectorNorm(_cuVectorXK); +- } +-} +- +-template +-void SparseBundleCPU::NonlinearOptimizeLM() { +- //////////////////////////////////////// +- TimerBA timer(this, TIMER_OPTIMIZATION); +- +- //////////////////////////////////////////////// +- float mse_convert_ratio = +- 1.0f / (_num_imgpt * __focal_scaling * __focal_scaling); +- float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f; +- const int edwidth = __verbose_sse ? 12 : 8; +- _projection_sse = +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- __initial_mse = __final_mse = _projection_sse * mse_convert_ratio; +- +- // compute jacobian diagonals for normalization +- if (__jacobian_normalize) PrepareJacobianNormalization(); +- +- // evalaute jacobian +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- /////////////////////////////////////////////////////////////// +- if (__verbose_level) +- std::cout << "Initial " << (__verbose_sse ? "sumed" : "mean") +- << " squared error = " << __initial_mse * error_display_ratio +- << "\n----------------------------------------------\n"; +- +- ////////////////////////////////////////////////// +- VectorF& cuImageTempProj = _cuVectorJX; +- // VectorF& cuVectorTempJX = _cuVectorJX; +- VectorF& cuVectorDX = _cuVectorSJ.size() ? 
_cuVectorZK : _cuVectorXK; +- +- ////////////////////////////////////////////////// +- float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf; +- SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm, +- g_inf); +- +- //////////////////////////////////// +- std::cout << std::left; +- for (int i = 0; i < __lm_max_iteration && !__abort_flag; +- __current_iteration = (++i)) { +- ////solve linear system +- int num_cg_iteration = SolveNormalEquation(damping); +- +- // there must be NaN somewhere +- if (num_cg_iteration == 0) { +- if (__verbose_level) +- std::cout << "#" << std::setw(3) << i << " quit on numeric errors\n"; +- __pba_return_code = 'E'; +- break; +- } +- +- // there must be infinity somewhere +- if (__recent_cg_status == 'I') { +- std::cout << "#" << std::setw(3) << i << " 0 I e=" << std::setw(edwidth) +- << "------- " +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << '\n' << std::setprecision(6); +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- --i; +- continue; +- } +- +- ///////////////////// +- ++__num_lm_iteration; +- +- //////////////////////////////////// +- float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm); +- +- // In this library, we check absolute difference instead of realtive +- // difference +- if (dx_norm <= __lm_delta_threshold) { +- // damping factor must be way too big...or it converges +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " quit on too small change (" << dx_norm << " < " +- << __lm_delta_threshold << ")\n"; +- __pba_return_code = 'S'; +- break; +- } +- /////////////////////////////////////////////////////////////////////// +- // update structure and motion, check reprojection error +- float new_residual = UpdateCameraPoint(cuVectorDX, cuImageTempProj); +- float average_residual = new_residual * mse_convert_ratio; +- float residual_reduction = _projection_sse - new_residual; +- +- // do we find a better solution? +- if (std::isfinite(new_residual) && residual_reduction > 0) { +- ////compute relative norm change +- float relative_reduction = 1.0f - (new_residual / _projection_sse); +- +- //////////////////////////////////// +- __num_lm_success++; // increase counter +- _projection_sse = new_residual; // save the new residual +- _cuImageProj.swap(cuImageTempProj); // save the new projection +- +- ////////////////////compute gain ratio/////////// +- float gain_ratio = +- SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping); +- +- //////////////////////////////////////////////// +- SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping, +- g_norm, g_inf); +- +- ///////////////////////////////////////////// +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=" << std::setw(6) +- << floor(gain_ratio * 1000.f) * 0.001f +- << " g=" << std::setw(g_norm > 0 ? 
9 : 1) << g_norm << " " +- << std::setw(9) << relative_reduction << ' ' << std::setw(9) +- << dx_norm << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- ///////////////////////////// +- if (!IsTimeBudgetAvailable()) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " used up time budget.\n"; +- __pba_return_code = 'T'; +- break; +- } else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i +- << " converged with small gradient\n"; +- __pba_return_code = 'G'; +- break; +- } else if (average_residual * error_display_ratio <= __lm_mse_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " satisfies MSE threshold\n"; +- __pba_return_code = 'M'; +- break; +- } else { +- /////////////////////////////adjust damping factor +- float temp = gain_ratio * 2.0f - 1.0f; +- float adaptive_adjust = 1.0f - temp * temp * temp; // powf(, 3.0f); // +- float auto_adjust = std::max(1.0f / 3.0f, adaptive_adjust); +- +- ////////////////////////////////////////////////// +- damping = damping * auto_adjust; +- damping_adjust = 2.0f; +- if (damping < __lm_minimum_damp) +- damping = __lm_minimum_damp; +- else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp && +- __lm_use_diagonal_damp) +- damping = __lm_maximum_damp; +- +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- } +- } else { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) << std::left +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=----- " << (__lm_check_gradient || __save_gradient_norm +- ? " g=---------" +- : " g=0") +- << " --------- " << std::setw(9) << dx_norm +- << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp && +- damping > __lm_damping_auto_switch) { +- __lm_use_diagonal_damp = false; +- damping = __lm_damping_auto_switch; +- damping_adjust = 2.0f; +- if (__verbose_level > 1) +- std::cout << "NOTE: switch to damping with an identity matix\n"; +- } else { +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- } +- } +- +- if (__verbose_level == 1) std::cout << '.'; +- } +- +- __final_mse = float(_projection_sse * mse_convert_ratio); +- __final_mse_x = +- __use_radial_distortion +- ? 
EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) * +- mse_convert_ratio +- : __final_mse; +-} +- +-#define PROFILE_REPORT2(A, T) \ +- std::cout << std::setw(24) << A << ": " << (T) << "\n"; +- +-#define PROFILE_REPORT(A) \ +- std::cout << std::setw(24) << A << ": " \ +- << (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n"; +- +-#define PROFILE_(B) \ +- BundleTimerStart(TIMER_PROFILE_STEP); \ +- for (int i = 0; i < repeat; ++i) { \ +- B; \ +- } \ +- BundleTimerSwitch(TIMER_PROFILE_STEP); +- +-#define PROFILE(A, B) PROFILE_(A B) PROFILE_REPORT(#A) +-#define PROXILE(A, B) PROFILE_(B) PROFILE_REPORT(A) +-#define PROTILE(FID, A, B) \ +- { \ +- float tbest = FLT_MAX; \ +- int nbest = 1; \ +- int nto = nthread[FID]; \ +- { \ +- std::ostringstream os1; \ +- os1 << #A "(" << nto << ")"; \ +- PROXILE(os1.str(), A B); \ +- } \ +- for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \ +- nthread[FID] = j; \ +- PROFILE_(A B); \ +- float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \ +- if (t > tbest) { \ +- if (j >= max(nto, 16)) break; \ +- } else { \ +- tbest = t; \ +- nbest = j; \ +- } \ +- } \ +- if (nto != 0) nthread[FID] = nbest; \ +- { \ +- std::ostringstream os; \ +- os << #A "(" << nbest << ")"; \ +- PROFILE_REPORT2(os.str(), tbest); \ +- } \ +- } +- +-#define PROTILE2(FID1, FID2, A, B) \ +- { \ +- int nt1 = nthread[FID1], nt2 = nthread[FID2]; \ +- { \ +- std::ostringstream os1; \ +- os1 << #A "(" << nt1 << "," << nt2 << ")"; \ +- PROXILE(os1.str(), A B); \ +- } \ +- float tbest = FLT_MAX; \ +- int nbest1 = 1, nbest2 = 1; \ +- nthread[FID2] = 1; \ +- for (int j = 1; j <= THREAD_NUM_MAX; j *= 2) { \ +- nthread[FID1] = j; \ +- PROFILE_(A B); \ +- float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \ +- if (t > tbest) { \ +- if (j >= max(nt1, 16)) break; \ +- } else { \ +- tbest = t; \ +- nbest1 = j; \ +- } \ +- } \ +- nthread[FID1] = nbest1; \ +- for (int j = 2; j <= THREAD_NUM_MAX; j *= 2) { \ +- nthread[FID2] = j; \ +- PROFILE_(A B); \ +- float t = BundleTimerGet(TIMER_PROFILE_STEP) / repeat; \ +- if (t > tbest) { \ +- if (j >= max(nt2, 16)) break; \ +- } else { \ +- tbest = t; \ +- nbest2 = j; \ +- } \ +- } \ +- nthread[FID2] = nbest2; \ +- { \ +- std::ostringstream os; \ +- os << #A "(" << nbest1 << "," << nbest2 << ")"; \ +- PROFILE_REPORT2(os.str(), tbest); \ +- } \ +- if (nt1 == 0) nthread[FID1] = 0; \ +- if (nt2 == 0) nthread[FID2] = 0; \ +- } +- +-template +-void SparseBundleCPU::RunProfileSteps() { +- const int repeat = std::max(__profile_pba, 1); +- int* nthread = __num_cpu_thread; +- std::cout << "---------------------------------\n" +- "| Run profiling steps (" +- << repeat << ") |\n" +- "---------------------------------\n" +- << std::left; +- ; +- +- /////////////////////////////////////////////// +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- if (__jacobian_normalize) PrepareJacobianNormalization(); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- ComputeBlockPC(__lm_initial_damp, true); +- /////////////////////////////// +- do { +- if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 && +- SolveNormalEquationPCGB(__lm_initial_damp) == 10) +- break; +- __lm_initial_damp *= 2.0f; +- } while (__lm_initial_damp < 1024.0f); +- std::cout << "damping set to " << __lm_initial_damp << " for profiling\n" +- << "---------------------------------\n"; +- /////////////////////// +- { +- int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration; +- __cg_max_iteration = __cg_min_iteration = 10; +- __num_cg_iteration = 
0; +- PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- std::cout << "---------------------------------\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMX", RunTestIterationLM(true)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMB", RunTestIterationLM(false)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- std::cout << "---------------------------------\n"; +- __cg_max_iteration = cgmax; +- __cg_min_iteration = cgmin; +- } +- +- ///////////////////////////////////////////////////// +- PROFILE(UpdateCameraPoint, (_cuVectorZK, _cuImageProj)); +- PROFILE(ComputeVectorNorm, (_cuVectorXK)); +- PROFILE(ComputeVectorDot, (_cuVectorXK, _cuVectorRK)); +- PROFILE(ComputeVectorNormW, (_cuVectorXK, _cuVectorRK)); +- PROFILE(ComputeSAXPY, ((Float)0.01f, _cuVectorXK, _cuVectorRK, _cuVectorZK)); +- PROFILE(ComputeSXYPZ, +- ((Float)0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK)); +- std::cout << "---------------------------------\n"; +- PROTILE(FUNC_VS, ComputeVectorNorm, +- (_cuImageProj, nthread[FUNC_VS])); // reset the parameter to 0 +- +- /////////////////////////////////////// +- { +- avec temp1(_cuImageProj.size()), temp2(_cuImageProj.size()); +- SetVectorZero(temp1); +- PROTILE(FUNC_VV, ComputeSAXPY, +- ((Float)0.01f, _cuImageProj, temp1, temp2, nthread[FUNC_VV])); +- } +- +- std::cout << "---------------------------------\n"; +- __multiply_jx_usenoj = false; +- +- //////////////////////////////////////////////////// +- PROTILE(FUNC_PJ, EvaluateProjection, +- (_cuCameraData, _cuPointData, _cuImageProj)); +- PROTILE2(FUNC_MPC, FUNC_MPP, ApplyBlockPC, (_cuVectorJtE, _cuVectorPK)); +- +- ///////////////////////////////////////////////// +- if (!__no_jacobian_store) { +- if (__jc_store_original) { +- PROTILE(FUNC_JX, ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- +- if (__jc_store_transpose) { +- PROTILE(FUNC_JJ_JCO_JCT_JP, EvaluateJacobians, ()); +- PROTILE2(FUNC_JTEC_JCT, FUNC_JTEP, ComputeJtE, +- (_cuImageProj, _cuVectorJtE)); +- PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f, true)); +- PROFILE(ComputeDiagonal, (_cuVectorPK)); +- +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- __jc_store_original = false; +- PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ()); +- __jc_store_original = true; +- } +- +- ////////////////////////////////////////////////// +- std::cout << "---------------------------------\n" +- "| Not storing transpose JC | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- _cuJacobianCameraT.resize(0); +- PROTILE(FUNC_JJ_JCO_JP, EvaluateJacobians, ()); +- PROTILE2(FUNC_JTEC_JCO, FUNC_JTEP, ComputeJtE, +- (_cuImageProj, _cuVectorJtE)); +- PROTILE2(FUNC_BCC_JCO, FUNC_BCP, ComputeBlockPC, (0.001f, true)); +- PROFILE(ComputeDiagonal, (_cuVectorPK)); +- } else if (__jc_store_transpose) { +- PROTILE2(FUNC_JTEC_JCT, FUNC_JTEP, 
ComputeJtE, +- (_cuImageProj, _cuVectorJtE)); +- PROTILE2(FUNC_BCC_JCT, FUNC_BCP, ComputeBlockPC, (0.001f, true)); +- PROFILE(ComputeDiagonal, (_cuVectorPK)); +- +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- PROTILE(FUNC_JJ_JCT_JP, EvaluateJacobians, ()); +- } +- } +- +- if (!__no_jacobian_store) { +- std::cout << "---------------------------------\n" +- "| Not storing Camera Jacobians | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- __jc_store_original = false; +- _cuJacobianCamera.resize(0); +- _cuJacobianCameraT.resize(0); +- PROTILE(FUNC_JJ_JP, EvaluateJacobians, ()); +- PROTILE(FUNC_JTE_, ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- // PROFILE(ComputeBlockPC, (0.001f, true)); +- } +- +- /////////////////////////////////////////////// +- std::cout << "---------------------------------\n" +- "| Not storing any jacobians |\n" +- "---------------------------------\n"; +- __no_jacobian_store = true; +- _cuJacobianPoint.resize(0); +- PROTILE(FUNC_JX_, ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- std::cout << "---------------------------------\n"; +-} +- +-template +-int SparseBundleCPU::FindProcessorCoreNum() { +-#ifdef _WIN32 +-#if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP +- SYSTEM_INFO sysinfo; +- GetNativeSystemInfo(&sysinfo); +-#else +- SYSTEM_INFO sysinfo; +- GetSystemInfo(&sysinfo); +-#endif +- return sysinfo.dwNumberOfProcessors; +-#else +- return sysconf(_SC_NPROCESSORS_ONLN); +-#endif +-} +- +-ParallelBA* NewSparseBundleCPU(bool dp, const int num_threads) { +-#ifndef SIMD_NO_DOUBLE +- if (dp) +- return new SparseBundleCPU(num_threads); +- else +-#endif +- return new SparseBundleCPU(num_threads); +-} +- +-} // namespace pba +diff --git a/lib/PBA/SparseBundleCPU.h b/lib/PBA/SparseBundleCPU.h +deleted file mode 100644 +index 73beb9e10..000000000 +--- a/lib/PBA/SparseBundleCPU.h ++++ /dev/null +@@ -1,286 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCPU.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : interface of the CPU-version of multi-core bundle adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#if !defined(SPARSE_BUNDLE_CPU_H) +-#define SPARSE_BUNDLE_CPU_H +- +-// BYTE-ALIGNMENT for data allocation (16 required for SSE, 32 required for AVX) +-// PREVIOUS version uses only SSE. The new version will include AVX. 
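// (Aligned AVX loads/stores such as _mm256_load_ps require 32-byte aligned
//  addresses, while SSE needs only 16; the ALIGN_PTR macro defined below just
//  rounds a pointer up to the next multiple of VECTOR_ALIGNMENT, e.g.
//  ALIGN_PTR(0x1001) == 0x1020 when VECTOR_ALIGNMENT is 32.)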
+-// SO the alignment is increased from 16 to 32 +-#define VECTOR_ALIGNMENT 32 +-#define FLOAT_ALIGN 8 +-#define VECTOR_ALIGNMENT_MASK (VECTOR_ALIGNMENT - 1) +-#define ALIGN_PTR(p) \ +- ((((size_t)p) + VECTOR_ALIGNMENT_MASK) & (~VECTOR_ALIGNMENT_MASK)) +- +-namespace pba { +- +-template +-class avec { +- bool _owner; +- Float* _data; +- Float* _last; +- size_t _size; +- size_t _capacity; +- +- public: +- static Float* allocate(size_t count) { +- size_t size = count * sizeof(Float); +-#ifdef _MSC_VER +- Float* p = (Float*)_aligned_malloc(size, VECTOR_ALIGNMENT); +- if (p == NULL) throw std::bad_alloc(); +- return p; +-#else +- char* p = (char*)malloc(size + VECTOR_ALIGNMENT + 4); +- if (p == NULL) throw std::bad_alloc(); +- char* p1 = p + 1; +- char* p2 = +- (char*)ALIGN_PTR(p1); //(char*) (((((size_t)p1) + 15) >> 4) << 4); +- char* p3 = (p2 - 1); +- p3[0] = (p2 - p); +- return (Float*)p2; +-#endif +- } +- static void deallocate(void* p) { +-#ifdef _MSC_VER +- _aligned_free(p); +-#else +- char* p3 = ((char*)p) - 1; +- free(((char*)p) - p3[0]); +-#endif +- } +- +- public: +- avec() { +- _owner = true; +- _last = _data = NULL; +- _size = _capacity = 0; +- } +- avec(size_t count) { +- _data = allocate(count); +- _size = _capacity = count; +- _last = _data + count; +- _owner = true; +- } +- ~avec() { +- if (_data && _owner) deallocate(_data); +- } +- +- inline void resize(size_t newcount) { +- if (!_owner) { +- _data = _last = NULL; +- _capacity = _size = 0; +- _owner = true; +- } +- if (newcount <= _capacity) { +- _size = newcount; +- _last = _data + newcount; +- } else { +- if (_data && _owner) deallocate(_data); +- _data = allocate(newcount); +- _size = _capacity = newcount; +- _last = _data + newcount; +- } +- } +- +- inline void set(Float* data, size_t count) { +- if (_data && _owner) deallocate(_data); +- _data = data; +- _owner = false; +- _size = count; +- _last = _data + _size; +- _capacity = count; +- } +- inline void swap(avec& next) { +- bool _owner_bak = _owner; +- Float* _data_bak = _data; +- Float* _last_bak = _last; +- size_t _size_bak = _size; +- size_t _capa_bak = _capacity; +- +- _owner = next._owner; +- _data = next._data; +- _last = next._last; +- _size = next._size; +- _capacity = next._capacity; +- +- next._owner = _owner_bak; +- next._data = _data_bak; +- next._last = _last_bak; +- next._size = _size_bak; +- next._capacity = _capa_bak; +- } +- +- inline operator Float*() { return _size ? _data : NULL; } +- inline operator Float* const() const { return _data; } +- inline Float* begin() { return _size ? _data : NULL; } +- inline Float* data() { return _size ? _data : NULL; } +- inline Float* end() { return _last; } +- inline const Float* begin() const { return _size ? 
_data : NULL; } +- inline const Float* end() const { return _last; } +- inline size_t size() const { return _size; } +- inline size_t IsValid() const { return _size; } +- void SaveToFile(const char* name); +-}; +- +-template +-class SparseBundleCPU : public ParallelBA, public ConfigBA { +- public: +- SparseBundleCPU(const int num_threads); +- +- typedef avec VectorF; +- typedef std::vector VectorI; +- typedef float float_t; +- +- protected: // cpu data +- int _num_camera; +- int _num_point; +- int _num_imgpt; +- CameraT* _camera_data; +- float* _point_data; +- +- //////////////////////////////// +- const float* _imgpt_data; +- const int* _camera_idx; +- const int* _point_idx; +- const int* _focal_mask; +- +- ///////////sumed square error +- float _projection_sse; +- +- protected: // cuda data +- VectorF _cuCameraData; +- VectorF _cuCameraDataEX; +- VectorF _cuPointData; +- VectorF _cuPointDataEX; +- VectorF _cuMeasurements; +- VectorF _cuImageProj; +- VectorF _cuJacobianCamera; +- VectorF _cuJacobianPoint; +- VectorF _cuJacobianCameraT; +- VectorI _cuProjectionMap; +- VectorI _cuPointMeasurementMap; +- VectorI _cuCameraMeasurementMap; +- VectorI _cuCameraMeasurementList; +- VectorI _cuCameraMeasurementListT; +- +- ////////////////////////// +- VectorF _cuBlockPC; +- VectorF _cuVectorSJ; +- +- /// LM normal equation +- VectorF _cuVectorJtE; +- VectorF _cuVectorJJ; +- VectorF _cuVectorJX; +- VectorF _cuVectorXK; +- VectorF _cuVectorPK; +- VectorF _cuVectorZK; +- VectorF _cuVectorRK; +- +- ////////////////////////////////// +- protected: +- int _num_imgpt_q; +- float _weight_q; +- VectorI _cuCameraQList; +- VectorI _cuCameraQMap; +- VectorF _cuCameraQMapW; +- VectorF _cuCameraQListW; +- +- protected: +- bool ProcessIndexCameraQ(std::vector& qmap, std::vector& qlist); +- void ProcessWeightCameraQ(std::vector& cpnum, std::vector& qmap, +- Float* qmapw, Float* qlistw); +- +- protected: // internal functions +- int ValidateInputData(); +- int InitializeBundle(); +- int GetParameterLength(); +- void BundleAdjustment(); +- void NormalizeData(); +- void TransferDataToHost(); +- void DenormalizeData(); +- void NormalizeDataF(); +- void NormalizeDataD(); +- bool InitializeStorageForSFM(); +- bool InitializeStorageForCG(); +- +- void SaveBundleRecord(int iter, float res, float damping, float& g_norm, +- float& g_inf); +- +- protected: +- void PrepareJacobianNormalization(); +- void EvaluateJacobians(); +- void ComputeJtE(VectorF& E, VectorF& JtE, int mode = 0); +- void ComputeJX(VectorF& X, VectorF& JX, int mode = 0); +- void ComputeDiagonal(VectorF& JJI); +- void ComputeBlockPC(float lambda, bool dampd); +- void ApplyBlockPC(VectorF& v, VectorF& pv, int mode = 0); +- float UpdateCameraPoint(VectorF& dx, VectorF& cuImageTempProj); +- float EvaluateProjection(VectorF& cam, VectorF& point, VectorF& proj); +- float EvaluateProjectionX(VectorF& cam, VectorF& point, VectorF& proj); +- float SaveUpdatedSystem(float residual_reduction, float dx_sqnorm, +- float damping); +- float EvaluateDeltaNorm(); +- int SolveNormalEquationPCGB(float lambda); +- int SolveNormalEquationPCGX(float lambda); +- int SolveNormalEquation(float lambda); +- void NonlinearOptimizeLM(); +- void AdjustBundleAdjsutmentMode(); +- void RunProfileSteps(); +- void RunTestIterationLM(bool reduced); +- void DumpCooJacobian(); +- +- private: +- static int FindProcessorCoreNum(); +- +- public: +- virtual void AbortBundleAdjustment() { __abort_flag = true; } +- virtual int GetCurrentIteration() { return __current_iteration; } +- virtual void 
SetNextTimeBudget(int seconds) { +- __bundle_time_budget = seconds; +- } +- virtual void SetNextBundleMode(BundleModeT mode) { +- __bundle_mode_next = mode; +- } +- virtual void SetFixedIntrinsics(bool fixed) { __fixed_intrinsics = fixed; } +- virtual void EnableRadialDistortion(DistortionT type) { +- __use_radial_distortion = type; +- } +- virtual void ParseParam(int narg, char** argv) { +- ConfigBA::ParseParam(narg, argv); +- } +- virtual ConfigBA* GetInternalConfig() { return this; } +- +- public: +- SparseBundleCPU(); +- virtual void SetCameraData(size_t ncam, CameraT* cams); +- virtual void SetPointData(size_t npoint, Point3D* pts); +- virtual void SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx); +- virtual void SetFocalMask(const int* fmask, float weight); +- virtual float GetMeanSquaredError(); +- virtual int RunBundleAdjustment(); +-}; +- +-ParallelBA* NewSparseBundleCPU(bool dp, const int num_threads); +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/SparseBundleCU.cpp b/lib/PBA/SparseBundleCU.cpp +deleted file mode 100644 +index 95929971f..000000000 +--- a/lib/PBA/SparseBundleCU.cpp ++++ /dev/null +@@ -1,1989 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCU.cpp +-// Author: Changchang Wu +-// Description : implementation of the CUDA-based multicore bundle adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-using std::vector; +-using std::cout; +-using std::pair; +-using std::ofstream; +- +-#include +-#include +-#include +-#include "pba.h" +-#include "SparseBundleCU.h" +- +-#include "ProgramCU.h" +- +-using namespace pba::ProgramCU; +- +-#ifdef _WIN32 +-#define finite _finite +-#endif +- +-namespace pba { +- +-typedef float float_t; // data type for host computation; double doesn't make +- // much difference +- +-#define CHECK_VEC(v1, v2) \ +- for (size_t j = 0; j < v1.size(); ++j) { \ +- if (v1[j] != v2[j]) { \ +- different++; \ +- std::cout << i << ' ' << j << ' ' << v1[j] << ' ' << v2[j] << '\n'; \ +- } \ +- } +-#define DEBUG_FUNCN(v, func, input, N) \ +- if (__debug_pba && v.IsValid()) { \ +- vector buf(v.GetLength()), buf_(v.GetLength()); \ +- for (int i = 0; i < N; ++i) { \ +- int different = 0; \ +- func input; \ +- ProgramCU::FinishWorkCUDA(); \ +- if (i > 0) { \ +- v.CopyToHost(&buf_[0]); \ +- CHECK_VEC(buf, buf_); \ +- } else { \ +- v.CopyToHost(&buf[0]); \ +- } \ +- if (different != 0) \ +- std::cout << #func << " : " << i << " : " << different << '\n'; \ +- } \ +- } +-#define DEBUG_FUNC(v, func, input) DEBUG_FUNCN(v, func, input, 2) +- +-SparseBundleCU::SparseBundleCU(int device) +- : ParallelBA(PBA_INVALID_DEVICE), +- _num_camera(0), +- _num_point(0), +- _num_imgpt(0), +- _num_imgpt_q(0), +- _camera_data(NULL), +- _point_data(NULL), +- _imgpt_data(NULL), +- _camera_idx(NULL), +- _point_idx(NULL), +- _projection_sse(0) { +- __selected_device = device; +-} +- +-size_t SparseBundleCU::GetMemCapacity() { +- if (__selected_device != __current_device) SetCudaDevice(__selected_device); +- size_t sz = ProgramCU::GetCudaMemoryCap(); +- if (sz < 1024) std::cout << "ERROR: CUDA is unlikely to be supported!\n"; +- return sz < 1024 ? 0 : sz; +-} +- +-void SparseBundleCU::SetCameraData(size_t ncam, CameraT* cams) { +- if (sizeof(CameraT) != 16 * sizeof(float)) exit(0); // never gonna happen...? 
+- _num_camera = (int)ncam; +- _camera_data = cams; +- _focal_mask = NULL; +-} +- +-void SparseBundleCU::SetFocalMask(const int* fmask, float weight) { +- _focal_mask = fmask; +- _weight_q = weight; +-} +- +-void SparseBundleCU::SetPointData(size_t npoint, Point3D* pts) { +- _num_point = (int)npoint; +- _point_data = (float*)pts; +-} +- +-void SparseBundleCU::SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx) { +- _num_imgpt = (int)nproj; +- _imgpt_data = (float*)imgpts; +- _camera_idx = cam_idx; +- _point_idx = point_idx; +- _imgpt_datax.resize(0); +-} +- +-float SparseBundleCU::GetMeanSquaredError() { +- return float(_projection_sse / +- (_num_imgpt * __focal_scaling * __focal_scaling)); +-} +- +-void SparseBundleCU::BundleAdjustment() { +- if (ValidateInputData() != STATUS_SUCCESS) return; +- +- // +- +- //////////////////////// +- TimerBA timer(this, TIMER_OVERALL); +- +- NormalizeData(); +- if (InitializeBundle() != STATUS_SUCCESS) { +- // failed to allocate gpu storage +- } else if (__profile_pba) { +- // profiling some stuff +- RunProfileSteps(); +- } else { +- // real optimization +- AdjustBundleAdjsutmentMode(); +- NonlinearOptimizeLM(); +- TransferDataToHost(); +- } +- DenormalizeData(); +-} +- +-int SparseBundleCU::RunBundleAdjustment() { +- if (__warmup_device) WarmupDevice(); +- ResetBundleStatistics(); +- BundleAdjustment(); +- if (__num_lm_success > 0) +- SaveBundleStatistics(_num_camera, _num_point, _num_imgpt); +- if (__num_lm_success > 0) PrintBundleStatistics(); +- ResetTemporarySetting(); +- return __num_lm_success; +-} +- +-bool SparseBundleCU::InitializeBundleGPU() { +- bool previous_allocated = __memory_usage > 0; +- +- bool success = TransferDataToGPU() && InitializeStorageForCG(); +- if (!success && previous_allocated) { +- if (__verbose_level) std::cout << "WARNING: try clean allocation\n"; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- +- if (!success && __jc_store_original) { +- if (__verbose_level) std::cout << "WARNING: try not storing original JC\n"; +- __jc_store_original = false; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- if (!success && __jc_store_transpose) { +- if (__verbose_level) std::cout << "WARNING: try not storing transpose JC\n"; +- __jc_store_transpose = false; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- if (!success && !__no_jacobian_store) { +- if (__verbose_level) std::cout << "WARNING: switch to memory saving mode\n"; +- __no_jacobian_store = true; +- ClearPreviousError(); +- ReleaseAllocatedData(); +- success = TransferDataToGPU() && InitializeStorageForCG(); +- } +- return success; +-} +- +-int SparseBundleCU::ValidateInputData() { +- if (_camera_data == NULL) return STATUS_CAMERA_MISSING; +- if (_point_data == NULL) return STATUS_POINT_MISSING; +- if (_imgpt_data == NULL) return STATUS_MEASURMENT_MISSING; +- if (_camera_idx == NULL || _point_idx == NULL) +- return STATUS_PROJECTION_MISSING; +- return STATUS_SUCCESS; +-} +- +-void SparseBundleCU::WarmupDevice() { +- std::cout << "Warm up device with storage allocation...\n"; +- if (__selected_device != __current_device) SetCudaDevice(__selected_device); +- CheckRequiredMemX(); +- InitializeBundleGPU(); +-} +- +-int SparseBundleCU::InitializeBundle() { +- ///////////////////////////////////////////////////// +- TimerBA 
timer(this, TIMER_GPU_ALLOCATION); +- if (__selected_device != __current_device) SetCudaDevice(__selected_device); +- CheckRequiredMemX(); +- ReserveStorageAuto(); +- if (!InitializeBundleGPU()) return STATUS_ALLOCATION_FAIL; +- return STATUS_SUCCESS; +-} +- +-int SparseBundleCU::GetParameterLength() { +- return _num_camera * 8 + 4 * _num_point; +-} +- +-bool SparseBundleCU::CheckRequiredMemX() { +- if (CheckRequiredMem(0)) return true; +- if (__jc_store_original) { +- if (__verbose_level) std::cout << "NOTE: not storing original JC\n"; +- __jc_store_original = false; +- if (CheckRequiredMem(1)) return true; +- } +- if (__jc_store_transpose) { +- if (__verbose_level) std::cout << "NOTE: not storing camera Jacobian\n"; +- __jc_store_transpose = false; +- if (CheckRequiredMem(1)) return true; +- } +- if (!__no_jacobian_store) { +- if (__verbose_level) std::cout << "NOTE: not storing any Jacobian\n"; +- __no_jacobian_store = true; +- if (CheckRequiredMem(1)) return true; +- } +- return false; +-} +- +-bool SparseBundleCU::CheckRequiredMem(int fresh) { +- int m = _num_camera, n = _num_point, k = _num_imgpt; +-#ifdef PBA_CUDA_ALLOCATE_MORE +- if (!fresh) { +- int m0 = _cuCameraData.GetReservedWidth(); +- m = std::max(m, m0); +- int n0 = _cuPointData.GetReservedWidth(); +- n = std::max(n, n0); +- int k0 = _cuMeasurements.GetReservedWidth(); +- k = std::max(k, k0); +- } +-#endif +- +- int p = 8 * m + 4 * n, q = _num_imgpt_q; +- size_t szn, total = GetCudaMemoryCap(); +- size_t sz0 = 800 * 600 * 2 * 4 * sizeof(float); // +- size_t szq = q > 0 ? (sizeof(float) * (q + m) * 4) : 0; +- size_t sz = sizeof(float) * (258 + 9 * n + 33 * m + 7 * k) + sz0; +- +- /////////////////////////////////// CG +- sz += p * 6 * sizeof(float); +- sz += ((__use_radial_distortion ? 64 : 56) * m + 12 * n) * sizeof(float); +- sz += (2 * (k + q) * sizeof(float)); +- if (sz > total) return false; +- +- ///////////////////////////////////// +- szn = (__no_jacobian_store ? 0 : (sizeof(float) * 8 * k)); +- if (sz + szn > total) +- __no_jacobian_store = false; +- else +- sz += szn; +- ///////////////////////////// +- szn = ((!__no_jacobian_store && __jc_store_transpose) ? 16 * k * sizeof(float) +- : 0); +- if (sz + szn > total) +- __jc_store_transpose = false; +- else +- sz += szn; +- /////////////////////////// +- szn = ((!__no_jacobian_store && __jc_store_original) ? 16 * k * sizeof(float) +- : 0); +- if (sz + szn > total) +- __jc_store_original = false; +- else +- sz += szn; +- /////////////////////////////// +- szn = ((!__no_jacobian_store && __jc_store_transpose && !__jc_store_original) +- ? k * sizeof(int) +- : 0); +- if (sz + szn > total) { +- __jc_store_transpose = false; +- sz -= (16 * k * sizeof(float)); +- } else +- sz += szn; +- +- return sz <= total; +-} +- +-void SparseBundleCU::ReserveStorage(size_t ncam, size_t npt, size_t nproj) { +- if (ncam <= 1 || npt <= 1 || nproj <= 1) { +- ReleaseAllocatedData(); +- // Reset the memory strategy to the default. 
+- __jc_store_transpose = true; +- __jc_store_original = true; +- __no_jacobian_store = false; +- } else { +- const int* camidx = _camera_idx; +- const int* ptidx = _point_idx; +- int ncam_ = _num_camera; +- int npt_ = _num_point; +- int nproj_ = _num_imgpt; +- +-#ifdef PBA_CUDA_ALLOCATE_MORE +- size_t ncam_reserved = _cuCameraData.GetReservedWidth(); +- size_t npt_reserved = _cuPointData.GetReservedWidth(); +- size_t nproj_reserved = _cuMeasurements.GetReservedWidth(); +- ncam = std::max(ncam, ncam_reserved); +- npt = std::max(npt, npt_reserved); +- nproj = std::max(nproj, nproj_reserved); +-#endif +- +- _camera_idx = NULL; +- _point_idx = NULL; +- _num_camera = (int)ncam; +- _num_point = (int)npt; +- _num_imgpt = (int)nproj; +- +- if (__verbose_level) +- std::cout << "Reserving storage for ncam = " << ncam << "; npt = " << npt +- << "; nproj = " << nproj << '\n'; +- InitializeBundleGPU(); +- +- _num_camera = ncam_; +- _num_point = npt_; +- _num_imgpt = nproj_; +- _camera_idx = camidx; +- _point_idx = ptidx; +- } +-} +- +-static size_t upgrade_dimension(size_t sz) { +- size_t x = 1; +- while (x < sz) x <<= 1; +- return x; +-} +- +-void SparseBundleCU::ReserveStorageAuto() { +- if (_cuCameraData.data() == NULL || _cuPointData.data() == NULL || +- _cuMeasurements.data() == NULL) +- return; +- ReserveStorage(upgrade_dimension(_num_camera), upgrade_dimension(_num_point), +- upgrade_dimension(_num_imgpt)); +-} +- +-#define REPORT_ALLOCATION(NAME) \ +- if (__verbose_allocation && NAME.GetDataSize() > 1024) \ +- std::cout << (NAME.GetDataSize() > 1024 * 1024 \ +- ? NAME.GetDataSize() / 1024 / 1024 \ +- : NAME.GetDataSize() / 1024) \ +- << (NAME.GetDataSize() > 1024 * 1024 ? "MB" : "KB") \ +- << "\t allocated for " #NAME "\n"; +- +-#define ASSERT_ALLOCATION(NAME) \ +- if (!success) { \ +- std::cerr << "WARNING: failed to allocate " \ +- << (__verbose_allocation ? #NAME "; size = " : "") \ +- << (total_sz / 1024 / 1024) << "MB + " \ +- << (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \ +- return false; \ +- } else { \ +- total_sz += NAME.GetDataSize(); \ +- REPORT_ALLOCATION(NAME); \ +- } +- +-#define CHECK_ALLOCATION(NAME) \ +- if (NAME.GetDataSize() == 0 && NAME.GetRequiredSize() > 0) { \ +- ClearPreviousError(); \ +- std::cerr << "WARNING: unable to allocate " #NAME ": " \ +- << (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \ +- } else { \ +- total_sz += NAME.GetDataSize(); \ +- REPORT_ALLOCATION(NAME); \ +- } +- +-#define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \ +- { \ +- success &= NAME.InitTexture(num, 1, channels); \ +- ASSERT_ALLOCATION(NAME); \ +- } +- +-#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \ +- if (option) { \ +- option = NAME.InitTexture(num, 1, channels); \ +- CHECK_ALLOCATION(NAME); \ +- } else { \ +- NAME.InitTexture(0, 0, 0); \ +- } +- +-bool SparseBundleCU::TransferDataToGPU() { +- // given m camera, npoint, k measurements.. 
the number of float is +- bool success = true; +- size_t total_sz = 0; +- +- ///////////////////////////////////////////////////////////////////////////// +- vector qmap, qlist; +- vector qmapw, qlistw; +- ProcessIndexCameraQ(qmap, qlist); +- +- ////////////////////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuBufferData, 256, 1); // 256 +- ALLOCATE_REQUIRED_DATA(_cuPointData, _num_point, 4); // 4n +- ALLOCATE_REQUIRED_DATA(_cuCameraData, _num_camera, 16); // 16m +- ALLOCATE_REQUIRED_DATA(_cuCameraDataEX, _num_camera, 16); // 16m +- +- //////////////////////////////////////////////////////////////// +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementMap, _num_camera + 1, 1); // m +- ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementList, _num_imgpt, 1); // k +- ALLOCATE_REQUIRED_DATA(_cuPointMeasurementMap, _num_point + 1, 1); // n +- ALLOCATE_REQUIRED_DATA(_cuProjectionMap, _num_imgpt, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuImageProj, _num_imgpt + _num_imgpt_q, 2); // 2k +- ALLOCATE_REQUIRED_DATA(_cuPointDataEX, _num_point, 4); // 4n +- ALLOCATE_REQUIRED_DATA(_cuMeasurements, _num_imgpt, 2); // 2k +- +- // +- ALLOCATE_REQUIRED_DATA(_cuCameraQMap, _num_imgpt_q, 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQMapW, _num_imgpt_q, 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQList, (_num_imgpt_q > 0 ? _num_camera : 0), +- 2); +- ALLOCATE_REQUIRED_DATA(_cuCameraQListW, (_num_imgpt_q > 0 ? _num_camera : 0), +- 2); +- +- if (__no_jacobian_store) { +- _cuJacobianCamera.ReleaseData(); +- _cuJacobianCameraT.ReleaseData(); +- _cuJacobianPoint.ReleaseData(); +- _cuCameraMeasurementListT.ReleaseData(); +- } else { +- ALLOCATE_REQUIRED_DATA(_cuJacobianPoint, _num_imgpt * 2, 4); // 8k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCameraT, _num_imgpt * 2, 8, +- __jc_store_transpose); // 16k +- ALLOCATE_OPTIONAL_DATA(_cuJacobianCamera, _num_imgpt * 2, 8, +- __jc_store_original); // 16k +- +- if ((!__jc_store_original || __profile_pba) && __jc_store_transpose) { +- ALLOCATE_OPTIONAL_DATA(_cuCameraMeasurementListT, _num_imgpt, 1, +- __jc_store_transpose); // k +- if (!__jc_store_transpose) _cuJacobianCameraT.ReleaseData(); +- } else { +- _cuCameraMeasurementListT.ReleaseData(); +- } +- } +- +- ///////////////////////////////////////////////// +- if (_camera_idx && _point_idx) { +- ////////////////////////////////////////// +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- ////mapping from camera to measuremnts +- vector cpi(_num_camera + 1), cpidx(_num_imgpt); +- vector cpnum(_num_camera, 0); +- cpi[0] = 0; +- for (int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++; +- for (int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1]; +- vector cptidx = cpi; +- for (int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i; +- if (_num_imgpt_q > 0) ProcessWeightCameraQ(cpnum, qmap, qmapw, qlistw); +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- +- /////////////////////////////////////////////////////////////////////////////// +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- _cuMeasurements.CopyFromHost(_imgpt_datax.size() > 0 ? 
&_imgpt_datax[0] +- : _imgpt_data); +- _cuCameraData.CopyFromHost(_camera_data); +- _cuPointData.CopyFromHost(_point_data); +- _cuCameraMeasurementMap.CopyFromHost(&cpi[0]); +- _cuCameraMeasurementList.CopyFromHost(&cpidx[0]); +- if (_cuCameraMeasurementListT.IsValid()) { +- vector ridx(_num_imgpt); +- for (int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i; +- _cuCameraMeasurementListT.CopyFromHost(&ridx[0]); +- } +- if (_num_imgpt_q > 0) { +- _cuCameraQMap.CopyFromHost(&qmap[0]); +- _cuCameraQMapW.CopyFromHost(&qmapw[0]); +- _cuCameraQList.CopyFromHost(&qlist[0]); +- _cuCameraQListW.CopyFromHost(&qlistw[0]); +- } +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- +- //////////////////////////////////////////// +- ///////mapping from point to measurment +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- vector ppi(_num_point + 1); +- for (int i = 0, last_point = -1; i < _num_imgpt; ++i) { +- int pt = _point_idx[i]; +- while (last_point < pt) ppi[++last_point] = i; +- } +- ppi[_num_point] = _num_imgpt; +- +- //////////projection map +- vector projection_map(_num_imgpt * 2); +- for (int i = 0; i < _num_imgpt; ++i) { +- int* imp = &projection_map[i * 2]; +- imp[0] = _camera_idx[i] * 2; +- imp[1] = _point_idx[i]; +- } +- BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION); +- +- ////////////////////////////////////////////////////////////// +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- _cuPointMeasurementMap.CopyFromHost(&ppi[0]); +- _cuProjectionMap.CopyFromHost(&projection_map[0]); +- BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION); +- } +- +- __memory_usage = total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Motion/Structure/Jacobian:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- return success; +-} +- +-bool SparseBundleCU::ProcessIndexCameraQ(vector& qmap, +- vector& qlist) { +- // reset q-data +- qmap.resize(0); +- qlist.resize(0); +- _num_imgpt_q = 0; +- +- // verify input +- if (_camera_idx == NULL) return true; +- if (_point_idx == NULL) return true; +- if (_focal_mask == NULL) return true; +- if (_num_camera == 0) return true; +- if (_weight_q <= 0) return true; +- +- /////////////////////////////////////// +- +- int error = 0; +- vector temp(_num_camera * 2, -1); +- +- for (int i = 0; i < _num_camera; ++i) { +- int iq = _focal_mask[i]; +- if (iq > i) { +- error = 1; +- break; +- } +- if (iq < 0) continue; +- if (iq == i) continue; +- int ip = temp[2 * iq]; +- // float ratio = _camera_data[i].f / _camera_data[iq].f; +- // if(ratio < 0.01 || ratio > 100) +- //{ +- // std::cout << "Warning: constaraints on largely different camreas\n"; +- // continue; +- //}else +- if (_focal_mask[iq] != iq) { +- error = 1; +- break; +- } else if (ip == -1) { +- temp[2 * iq] = i; +- temp[2 * iq + 1] = i; +- temp[2 * i] = iq; +- temp[2 * i + 1] = iq; +- } else { +- // maintain double-linked list +- temp[2 * i] = ip; +- temp[2 * i + 1] = iq; +- temp[2 * ip + 1] = i; +- temp[2 * iq] = i; +- } +- } +- +- if (error) { +- std::cout << "Error: incorrect constraints\n"; +- _focal_mask = NULL; +- return false; +- } +- +- qlist.resize(_num_camera * 2, -1); +- for (int i = 0; i < _num_camera; ++i) { +- int inext = temp[2 * i + 1]; +- if (inext == -1) continue; +- qlist[2 * i] = _num_imgpt + _num_imgpt_q; +- qlist[2 * inext + 1] = _num_imgpt + _num_imgpt_q; +- qmap.push_back(i); +- qmap.push_back(inext); +- _num_imgpt_q++; +- } +- return true; +-} +- +-void SparseBundleCU::ProcessWeightCameraQ(vector& cpnum, vector& qmap, +- vector& 
qmapw, +- vector& qlistw) { +- // set average focal length and average radial distortion +- vector qpnum(_num_camera, 0), qcnum(_num_camera, 0); +- vector fs(_num_camera, 0), rs(_num_camera, 0); +- +- for (int i = 0; i < _num_camera; ++i) { +- int qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- fs[qi] += _camera_data[i].f; +- rs[qi] += _camera_data[i].radial; +- qpnum[qi] += cpnum[i]; +- qcnum[qi] += 1.0f; +- } +- +- // this seems not really matter..they will converge anyway +- for (int i = 0; i < _num_camera; ++i) { +- int qi = _focal_mask[i]; +- if (qi == -1) continue; +- // float ratio = _camera_data[i].f / _camera_data[qi].f; +- // if(ratio < 0.01 || ratio > 100) continue; +- _camera_data[i].f = fs[qi] / qcnum[qi]; +- _camera_data[i].radial = rs[qi] / qcnum[qi]; +- } +- +- qmapw.resize(_num_imgpt_q * 2, 0); +- qlistw.resize(_num_camera * 2, 0); +- for (int i = 0; i < _num_imgpt_q; ++i) { +- int cidx = qmap[i * 2], qi = _focal_mask[cidx]; +- float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q; +- float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0); +- qmapw[i * 2] = wi; +- qmapw[i * 2 + 1] = wr; +- qlistw[cidx * 2] = wi; +- qlistw[cidx * 2 + 1] = wr; +- } +-} +- +-void SparseBundleCU::ReleaseAllocatedData() { +- _cuCameraData.ReleaseData(); +- _cuCameraDataEX.ReleaseData(); +- _cuPointData.ReleaseData(); +- _cuPointDataEX.ReleaseData(); +- _cuMeasurements.ReleaseData(); +- _cuImageProj.ReleaseData(); +- _cuJacobianCamera.ReleaseData(); +- _cuJacobianPoint.ReleaseData(); +- _cuJacobianCameraT.ReleaseData(); +- _cuProjectionMap.ReleaseData(); +- _cuPointMeasurementMap.ReleaseData(); +- _cuCameraMeasurementMap.ReleaseData(); +- _cuCameraMeasurementList.ReleaseData(); +- _cuCameraMeasurementListT.ReleaseData(); +- _cuBufferData.ReleaseData(); +- _cuBlockPC.ReleaseData(); +- _cuVectorJtE.ReleaseData(); +- _cuVectorJJ.ReleaseData(); +- _cuVectorJX.ReleaseData(); +- _cuVectorXK.ReleaseData(); +- _cuVectorPK.ReleaseData(); +- _cuVectorZK.ReleaseData(); +- _cuVectorRK.ReleaseData(); +- _cuVectorSJ.ReleaseData(); +- _cuCameraQList.ReleaseData(); +- _cuCameraQMap.ReleaseData(); +- _cuCameraQMapW.ReleaseData(); +- _cuCameraQListW.ReleaseData(); +- ProgramCU::ResetCurrentDevice(); +-} +- +-void SparseBundleCU::NormalizeDataF() { +- int incompatible_radial_distortion = 0; +- if (__focal_normalize) { +- if (__focal_scaling == 1.0f) { +- //------------------------------------------------------------------ +- ////////////////////////////////////////////////////////////// +- vector focals(_num_camera); +- for (int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f; +- std::nth_element(focals.begin(), focals.begin() + _num_camera / 2, +- focals.end()); +- float median_focal_length = focals[_num_camera / 2]; +- __focal_scaling = __data_normalize_median / median_focal_length; +- float radial_factor = median_focal_length * median_focal_length * 4.0f; +- +- /////////////////////////////// +- _imgpt_datax.resize(_num_imgpt * 2); +- for (int i = 0; i < _num_imgpt * 2; ++i) +- _imgpt_datax[i] = _imgpt_data[i] * __focal_scaling; +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f *= __focal_scaling; +- if (!__use_radial_distortion) { +- } else if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- incompatible_radial_distortion++; +- _camera_data[i].radial = 0; +- } else if 
(__use_radial_distortion == -1) { +- _camera_data[i].radial *= radial_factor; +- } +- } +- if (__verbose_level > 2) +- std::cout << "Focal length normalized by " << __focal_scaling << '\n'; +- __reset_initial_distortion = false; +- } +- } else { +- if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) { +- if (__reset_initial_distortion) { +- _camera_data[i].radial = 0; +- } else if (_camera_data[i].distortion_type != __use_radial_distortion) { +- _camera_data[i].radial = 0; +- incompatible_radial_distortion++; +- } +- } +- __reset_initial_distortion = false; +- } +- _imgpt_datax.resize(0); +- } +- +- if (incompatible_radial_distortion) { +- std::cout << "ERROR: incompatible radial distortion input; reset to 0;\n"; +- } +-} +- +-void SparseBundleCU::NormalizeDataD() { +- if (__depth_scaling == 1.0f) { +- const float dist_bound = 1.0f; +- vector oz(_num_imgpt); +- vector cpdist1(_num_camera, dist_bound); +- vector cpdist2(_num_camera, -dist_bound); +- vector camnpj(_num_camera, 0), cambpj(_num_camera, 0); +- int bad_point_count = 0; +- for (int i = 0; i < _num_imgpt; ++i) { +- int cmidx = _camera_idx[i]; +- CameraT* cam = _camera_data + cmidx; +- float* rz = cam->m[2]; +- float* x = _point_data + 4 * _point_idx[i]; +- oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->t[2]); +- +- ///////////////////////////////////////////////// +- // points behind camera may causes big problem +- float ozr = oz[i] / cam->t[2]; +- if (fabs(ozr) < __depth_check_epsilon) { +- bad_point_count++; +- float px = cam->f * (cam->m[0][0] * x[0] + cam->m[0][1] * x[1] + +- cam->m[0][2] * x[2] + cam->t[0]); +- float py = cam->f * (cam->m[1][0] * x[0] + cam->m[1][1] * x[1] + +- cam->m[1][2] * x[2] + cam->t[1]); +- float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1]; +- bool checkx = fabs(mx) > fabs(my); +- if ((checkx && px * oz[i] * mx < 0 && fabs(mx) > 64) || +- (!checkx && py * oz[i] * my < 0 && fabs(my) > 64)) { +- if (__verbose_level > 3) +- std::cout << "Warning: proj of #" << cmidx +- << " on the wrong side, oz = " << oz[i] << " (" +- << (px / oz[i]) << ',' << (py / oz[i]) << ") (" << mx +- << ',' << my << ")\n"; +- ///////////////////////////////////////////////////////////////////////// +- if (oz[i] > 0) +- cpdist2[cmidx] = 0; +- else +- cpdist1[cmidx] = 0; +- } +- if (oz[i] >= 0) +- cpdist1[cmidx] = std::min(cpdist1[cmidx], oz[i]); +- else +- cpdist2[cmidx] = std::max(cpdist2[cmidx], oz[i]); +- } +- if (oz[i] < 0) { +- __num_point_behind++; +- cambpj[cmidx]++; +- } +- camnpj[cmidx]++; +- } +- if (bad_point_count > 0 && __depth_degeneracy_fix) { +- if (!__focal_normalize || !__depth_normalize) +- std::cout << "Enable data normalization on degeneracy\n"; +- __focal_normalize = true; +- __depth_normalize = true; +- } +- if (__depth_normalize) { +- std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end()); +- float oz_median = oz[_num_imgpt / 2]; +- float shift_min = std::min(oz_median * 0.001f, 1.0f); +- float dist_threshold = shift_min * 0.1f; +- __depth_scaling = (1.0f / oz_median) / __data_normalize_median; +- if (__verbose_level > 2) +- std::cout << "Depth normalized by " << __depth_scaling << " (" +- << oz_median << ")\n"; +- +- for (int i = 0; i < _num_camera; ++i) { +- // move the camera a little bit? +- if (!__depth_degeneracy_fix) { +- } else if ((cpdist1[i] < dist_threshold || +- cpdist2[i] > -dist_threshold)) { +- float shift = shift_min; //(cpdist1[i] <= -cpdist2[i] ? 
shift_min : +- //-shift_min); +- // if(cpdist1[i] < dist_bound && cpdist2[i] > - dist_bound) shift = - +- // 0.5f * (cpdist1[i] + cpdist2[i]); +- bool boths = +- cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold; +- _camera_data[i].t[2] += shift; +- if (__verbose_level > 3) +- std::cout << "Adjust C" << std::setw(5) << i << " by " +- << std::setw(12) << shift << " [B" << std::setw(2) +- << cambpj[i] << "/" << std::setw(5) << camnpj[i] << "] [" +- << (boths ? 'X' : ' ') << "][" << cpdist1[i] << ", " +- << cpdist2[i] << "]\n"; +- __num_camera_modified++; +- } +- _camera_data[i].t[0] *= __depth_scaling; +- _camera_data[i].t[1] *= __depth_scaling; +- _camera_data[i].t[2] *= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- ///////////////////////////////// +- _point_data[4 * i + 0] *= __depth_scaling; +- _point_data[4 * i + 1] *= __depth_scaling; +- _point_data[4 * i + 2] *= __depth_scaling; +- } +- } +- if (__num_point_behind > 0) +- std::cout << "WARNING: " << __num_point_behind +- << " points are behind cameras.\n"; +- if (__num_camera_modified > 0) +- std::cout << "WARNING: " << __num_camera_modified +- << " camera moved to avoid degeneracy.\n"; +- } +-} +- +-void SparseBundleCU::NormalizeData() { +- TimerBA timer(this, TIMER_PREPROCESSING); +- NormalizeDataD(); +- NormalizeDataF(); +-} +- +-void SparseBundleCU::DenormalizeData() { +- if (__focal_normalize && __focal_scaling != 1.0f) { +- float squared_focal_factor = (__focal_scaling * __focal_scaling); +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].f /= __focal_scaling; +- if (__use_radial_distortion == -1) +- _camera_data[i].radial *= squared_focal_factor; +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- _projection_sse /= squared_focal_factor; +- __focal_scaling = 1.0f; +- _imgpt_datax.resize(0); +- } else if (__use_radial_distortion) { +- for (int i = 0; i < _num_camera; ++i) +- _camera_data[i].distortion_type = __use_radial_distortion; +- } +- +- if (__depth_normalize && __depth_scaling != 1.0f) { +- for (int i = 0; i < _num_camera; ++i) { +- _camera_data[i].t[0] /= __depth_scaling; +- _camera_data[i].t[1] /= __depth_scaling; +- _camera_data[i].t[2] /= __depth_scaling; +- } +- for (int i = 0; i < _num_point; ++i) { +- _point_data[4 * i + 0] /= __depth_scaling; +- _point_data[4 * i + 1] /= __depth_scaling; +- _point_data[4 * i + 2] /= __depth_scaling; +- } +- __depth_scaling = 1.0f; +- } +-} +- +-void SparseBundleCU::TransferDataToHost() { +- TimerBA timer(this, TIMER_GPU_DOWNLOAD); +- _cuCameraData.CopyToHost(_camera_data); +- _cuPointData.CopyToHost(_point_data); +-} +- +-float SparseBundleCU::EvaluateProjection(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjection(cam, point, _cuMeasurements, _cuProjectionMap, proj, +- __use_radial_distortion); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt); +- return (float)ComputeVectorNorm(proj, _cuBufferData); +-} +- +-float SparseBundleCU::EvaluateProjectionX(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj) { +- ++__num_projection_eval; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true); +- ComputeProjectionX(cam, point, _cuMeasurements, _cuProjectionMap, proj, +- __use_radial_distortion); +- if (_num_imgpt_q > 0) +- ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt); +- return (float)ComputeVectorNorm(proj, _cuBufferData); +-} +- 
+-void SparseBundleCU::DebugProjections() { +- double e1 = 0, e2 = 0; +- for (int i = 0; i < _num_imgpt; ++i) { +- float* c = (float*)(_camera_data + _camera_idx[i]); +- float* p = _point_data + 4 * _point_idx[i]; +- const float* m = _imgpt_datax.size() > 0 ? (&_imgpt_datax[i * 2]) +- : (_imgpt_data + 2 * i); +- float* r = c + 4; +- float* t = c + 1; +- float dx1, dy1; +- //////////////////////////////////////////////////////////////////////////////// +- float z = r[6] * p[0] + r[7] * p[1] + r[8] * p[2] + t[2]; +- float xx = (r[0] * p[0] + r[1] * p[1] + r[2] * p[2] + t[0]); +- float yy = (r[3] * p[0] + r[4] * p[1] + r[5] * p[2] + t[1]); +- float x = xx / z; +- float y = yy / z; +- if (__use_radial_distortion == -1) { +- float rn = (m[0] * m[0] + m[1] * m[1]) * c[13] + 1.0f; +- dx1 = c[0] * x - m[0] * rn; +- dy1 = c[0] * y - m[1] * rn; +- e1 += (dx1 * dx1 + dy1 * dy1); +- e2 += (dx1 * dx1 + dy1 * dy1) / (rn * rn); +- } else if (__use_radial_distortion) { +- float rn = (x * x + y * y) * c[13] + 1.0f; +- dx1 = c[0] * x * rn - m[0]; +- dy1 = c[0] * y * rn - m[1]; +- e1 += (dx1 * dx1 + dy1 * dy1) / (rn * rn); +- e2 += (dx1 * dx1 + dy1 * dy1); +- } else { +- dx1 = c[0] * x - m[0]; +- dy1 = c[0] * y - m[1]; +- e1 += (dx1 * dx1 + dy1 * dy1); +- e2 += (dx1 * dx1 + dy1 * dy1); +- } +- if (!isfinite(dx1) || !isfinite(dy1)) +- std::cout << "x = " << xx << " y = " << yy << " z = " << z << '\n' +- << "t0 = " << t[0] << " t1 = " << t[1] << " t2 = " << t[2] +- << '\n' << "p0 = " << p[0] << " p1 = " << p[1] +- << " p2 = " << p[2] << '\n'; +- } +- e1 = e1 / (__focal_scaling * __focal_scaling) / _num_imgpt; +- e2 = e2 / (__focal_scaling * __focal_scaling) / _num_imgpt; +- std::cout << "DEBUG: mean squared error = " << e1 +- << " in undistorted domain;\n"; +- std::cout << "DEBUG: mean squared error = " << e2 +- << " in distorted domain.\n"; +-} +- +-bool SparseBundleCU::InitializeStorageForCG() { +- bool success = true; +- size_t total_sz = 0; +- int plen = GetParameterLength(); // q = 8m + 4n +- +- //////////////////////////////////////////// 6q +- ALLOCATE_REQUIRED_DATA(_cuVectorJtE, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorXK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorPK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorRK, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorJJ, plen, 1); +- ALLOCATE_REQUIRED_DATA(_cuVectorZK, plen, 1); +- +- ///////////////////////////////// +- unsigned int cblock_len = (__use_radial_distortion ? 
64 : 56); +- ALLOCATE_REQUIRED_DATA(_cuBlockPC, _num_camera * cblock_len + 12 * _num_point, +- 1); // 64m + 12n +- if (__accurate_gain_ratio) { +- ALLOCATE_REQUIRED_DATA(_cuVectorJX, _num_imgpt + _num_imgpt_q, 2); // 2k +- } else { +- _cuVectorJX.SetTexture(_cuImageProj.data(), _num_imgpt + _num_imgpt_q, 2); +- } +- ALLOCATE_OPTIONAL_DATA(_cuVectorSJ, plen, 1, __jacobian_normalize); +- +- ///////////////////////////////////////// +- __memory_usage += total_sz; +- if (__verbose_level > 1) +- std::cout << "Memory for Conjugate Gradient Solver:\t" +- << (total_sz / 1024 / 1024) << "MB\n"; +- return success; +-} +- +-void SparseBundleCU::PrepareJacobianNormalization() { +- if (!_cuVectorSJ.IsValid()) return; +- +- if ((__jc_store_transpose || __jc_store_original) && +- _cuJacobianPoint.IsValid() && !__bundle_current_mode) { +- CuTexImage null; +- null.SwapData(_cuVectorSJ); +- EvaluateJacobians(); +- null.SwapData(_cuVectorSJ); +- ComputeDiagonal(_cuVectorJJ, _cuVectorSJ); +- ComputeSQRT(_cuVectorSJ); +- } else { +- CuTexImage null; +- null.SwapData(_cuVectorSJ); +- EvaluateJacobians(); +- ComputeBlockPC(0, true); +- null.SwapData(_cuVectorSJ); +- _cuVectorJJ.SwapData(_cuVectorSJ); +- ProgramCU::ComputeRSQRT(_cuVectorSJ); +- } +-} +- +-void SparseBundleCU::EvaluateJacobians(bool shuffle) { +- if (__no_jacobian_store) return; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original && +- !__jc_store_transpose) +- return; +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JJ, true); +- +- if (__jc_store_original || !__jc_store_transpose) { +- ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCamera, +- _cuJacobianPoint, _cuProjectionMap, _cuVectorSJ, +- _cuMeasurements, _cuCameraMeasurementList, +- __fixed_intrinsics, __use_radial_distortion, false); +- if (shuffle && __jc_store_transpose && _cuJacobianCameraT.IsValid()) +- ShuffleCameraJacobian(_cuJacobianCamera, _cuCameraMeasurementList, +- _cuJacobianCameraT); +- } else { +- ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCameraT, +- _cuJacobianPoint, _cuProjectionMap, _cuVectorSJ, +- _cuMeasurements, _cuCameraMeasurementListT, +- __fixed_intrinsics, __use_radial_distortion, true); +- } +- ++__num_jacobian_eval; +-} +- +-void SparseBundleCU::ComputeJtE(CuTexImage& E, CuTexImage& JtE, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JTE, true); +- if (mode == 0) mode = __bundle_current_mode; +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) { +- ProgramCU::ComputeJtE_(E, JtE, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuCameraMeasurementMap, _cuCameraMeasurementList, +- _cuPointMeasurementMap, _cuProjectionMap, +- _cuJacobianPoint, __fixed_intrinsics, +- __use_radial_distortion, mode); +- +- //////////////////////////////////////////////////////////////////////////////////// +- if (!_cuVectorSJ.IsValid()) { +- } else if (mode == 2) { +- if (!_cuJacobianPoint.IsValid()) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_point * 4, _num_camera * 8); +- } else if (mode == 1) +- ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8); +- else +- ComputeVXY(JtE, _cuVectorSJ, JtE, +- _cuJacobianPoint.IsValid() ? 
_num_camera * 8 : 0); +- +- } else if (__jc_store_transpose) { +- ProgramCU::ComputeJtE(E, _cuJacobianCameraT, _cuCameraMeasurementMap, +- _cuCameraMeasurementList, _cuJacobianPoint, +- _cuPointMeasurementMap, JtE, true, mode); +- } else { +- ProgramCU::ComputeJtE(E, _cuJacobianCamera, _cuCameraMeasurementMap, +- _cuCameraMeasurementList, _cuJacobianPoint, +- _cuPointMeasurementMap, JtE, false, mode); +- } +- +- if (mode != 2 && _num_imgpt_q > 0) +- ProgramCU::ComputeJQtEC(E, _cuCameraQList, _cuCameraQListW, _cuVectorSJ, +- JtE); +-} +- +-void SparseBundleCU::SaveBundleRecord(int iter, float res, float damping, +- float& g_norm, float& g_inf) { +- // do not really compute if parameter not specified... +- // for large dataset, it never converges.. +- g_inf = +- __lm_check_gradient ? ComputeVectorMax(_cuVectorJtE, _cuBufferData) : 0; +- g_norm = __save_gradient_norm +- ? float(ComputeVectorNorm(_cuVectorJtE, _cuBufferData)) +- : g_inf; +- ConfigBA::SaveBundleRecord(iter, res, damping, g_norm, g_inf); +-} +- +-void SparseBundleCU::ComputeJX(CuTexImage& X, CuTexImage& JX, int mode) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JX, true); +- if (__no_jacobian_store || (__multiply_jx_usenoj && mode != 2) || +- !__jc_store_original) { +- if (_cuVectorSJ.IsValid()) { +- if (mode == 0) +- ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK); +- else if (mode == 1) +- ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_camera * 8); +- else if (mode == 2) +- ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_point * 4, +- _num_camera * 8); +- ProgramCU::ComputeJX_(_cuVectorZK, JX, _cuCameraData, _cuPointData, +- _cuMeasurements, _cuProjectionMap, +- __fixed_intrinsics, __use_radial_distortion, mode); +- } else { +- ProgramCU::ComputeJX_(X, JX, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuProjectionMap, __fixed_intrinsics, +- __use_radial_distortion, mode); +- } +- } else { +- ProgramCU::ComputeJX(_num_camera * 2, X, _cuJacobianCamera, +- _cuJacobianPoint, _cuProjectionMap, JX, mode); +- } +- +- if (_num_imgpt_q > 0 && mode != 2) { +- ProgramCU::ComputeJQX(X, _cuCameraQMap, _cuCameraQMapW, _cuVectorSJ, JX, +- _num_imgpt); +- } +-} +- +-void SparseBundleCU::ComputeBlockPC(float lambda, bool dampd) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_BC, true); +- +- bool use_diagonal_q = _cuCameraQListW.IsValid() && __bundle_current_mode != 2; +- if (use_diagonal_q) +- ComputeDiagonalQ(_cuCameraQListW, _cuVectorSJ, _cuVectorJJ); +- +- if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) { +- ComputeDiagonalBlock_( +- lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements, +- _cuCameraMeasurementMap, _cuCameraMeasurementList, +- _cuPointMeasurementMap, _cuProjectionMap, _cuJacobianPoint, _cuVectorSJ, +- _cuVectorJJ, _cuBlockPC, __fixed_intrinsics, __use_radial_distortion, +- use_diagonal_q, __bundle_current_mode); +- } else if (__jc_store_transpose) { +- ComputeDiagonalBlock(lambda, dampd, _cuJacobianCameraT, +- _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuVectorJJ, _cuBlockPC, __use_radial_distortion, true, +- use_diagonal_q, __bundle_current_mode); +- } else { +- ComputeDiagonalBlock(lambda, dampd, _cuJacobianCamera, +- _cuCameraMeasurementMap, _cuJacobianPoint, +- _cuPointMeasurementMap, _cuCameraMeasurementList, +- _cuVectorJJ, _cuBlockPC, __use_radial_distortion, +- false, use_diagonal_q, __bundle_current_mode); +- } +-} +- +-void SparseBundleCU::ApplyBlockPC(CuTexImage& v, CuTexImage& pv, int mode) 
{ +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_MP, true); +- MultiplyBlockConditioner(_num_camera, _num_point, _cuBlockPC, v, pv, +- __use_radial_distortion, mode); +-} +- +-void SparseBundleCU::ComputeDiagonal(CuTexImage& JJ, CuTexImage& JJI) { +- ////////////////////checking the impossible. +- if (__no_jacobian_store) return; +- if (!__jc_store_transpose && !__jc_store_original) return; +- +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_DD, true); +- bool use_diagonal_q = _cuCameraQListW.IsValid(); +- if (use_diagonal_q) { +- CuTexImage null; +- ComputeDiagonalQ(_cuCameraQListW, null, JJ); +- } +- if (__jc_store_transpose) { +- ProgramCU::ComputeDiagonal(_cuJacobianCameraT, _cuCameraMeasurementMap, +- _cuJacobianPoint, _cuPointMeasurementMap, +- _cuCameraMeasurementList, JJ, JJI, true, +- __use_radial_distortion, use_diagonal_q); +- } else { +- ProgramCU::ComputeDiagonal(_cuJacobianCamera, _cuCameraMeasurementMap, +- _cuJacobianPoint, _cuPointMeasurementMap, +- _cuCameraMeasurementList, JJ, JJI, false, +- __use_radial_distortion, use_diagonal_q); +- } +-} +- +-int SparseBundleCU::SolveNormalEquationPCGX(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- CuTexImage null; +- CuTexImage& VectorDP = +- __lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- /////////////////////////////////////////////////////// +- // B = [BC 0 ; 0 BP] +- // m = [mc 0; 0 mp]; +- // A x= BC * x - JcT * Jp * mp * JpT * Jc * x +- // = JcT * Jc x + lambda * D * x + ........ 
+- //////////////////////////////////////////////////////////// +- +- CuTexImage r; +- r.SetTexture(_cuVectorRK.data(), 8 * _num_camera); +- CuTexImage p; +- p.SetTexture(_cuVectorPK.data(), 8 * _num_camera); +- CuTexImage z; +- z.SetTexture(_cuVectorZK.data(), 8 * _num_camera); +- CuTexImage x; +- x.SetTexture(_cuVectorXK.data(), 8 * _num_camera); +- CuTexImage d; +- d.SetTexture(VectorDP.data(), 8 * _num_camera); +- +- CuTexImage& u = _cuVectorRK; +- CuTexImage& v = _cuVectorPK; +- CuTexImage up; +- up.SetTexture(u.data() + 8 * _num_camera, 4 * _num_point); +- CuTexImage vp; +- vp.SetTexture(v.data() + 8 * _num_camera, 4 * _num_point); +- CuTexImage uc; +- uc.SetTexture(z.data(), 8 * _num_camera); +- +- CuTexImage& e = _cuVectorJX; +- CuTexImage& e2 = _cuImageProj; +- +- ApplyBlockPC(_cuVectorJtE, u, 2); +- ComputeJX(u, e, 2); +- ComputeJtE(e, uc, 1); +- ComputeSAXPY(-1.0f, uc, _cuVectorJtE, r); // r +- ApplyBlockPC(r, p, 1); // z = p = M r +- +- float_t rtz0 = (float_t)ComputeVectorDot(r, p, _cuBufferData); // r(0)' * +- // z(0) +- ComputeJX(p, e, 1); // Jc * x +- ComputeJtE(e, u, 2); // JpT * jc * x +- ApplyBlockPC(u, v, 2); +- float_t qtq0 = (float_t)ComputeVectorNorm(e, _cuBufferData); // q(0)' * q(0) +- float_t pdp0 = +- (float_t)ComputeVectorNormW(p, d, _cuBufferData); // p(0)' * DDD * p(0) +- float_t uv0 = (float_t)ComputeVectorDot(up, vp, _cuBufferData); +- float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- ComputeSAX((float)alpha0, p, x); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY(-1.0f, e2, e, e); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ(lambda, d, p, uc, uc); +- ComputeSAXPY((float)-alpha0, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- +- ////////////////////////////////////////////////////////////////////////// +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(r, z, 1); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- r, z, _cuBufferData); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? 
'0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- //////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((float)betak, p, z, p); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(p, e, 1); // Jc * p +- ComputeJtE(e, u, 2); // JpT * jc * p +- ApplyBlockPC(u, v, 2); +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = (float_t)ComputeVectorNorm(e, _cuBufferData); // q(k)' q(k) +- float_t pdpk = +- (float_t)ComputeVectorNormW(p, d, _cuBufferData); // p(k)' * DDD * p(k) +- float_t uvk = (float_t)ComputeVectorDot(up, vp, _cuBufferData); +- float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. +- +- //////////////////////////////////////////////// +- ComputeSAXPY((float)alphak, p, x, x); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p +- ComputeSAXPY(-1.0f, e2, e, e); +- ComputeJtE(e, uc, 1); // JcT * .... +- ComputeSXYPZ(lambda, d, p, uc, uc); +- ComputeSAXPY((float)-alphak, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk +- } +- +- // if(__recent_cg_status == 'X') return iteration; +- +- ComputeJX(x, e, 1); +- ComputeJtE(e, u, 2); +- CuTexImage jte_p; +- jte_p.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 4); +- ComputeSAXPY(-1.0f, up, jte_p, vp); +- ApplyBlockPC(v, _cuVectorXK, 2); +- return iteration; +-} +-int SparseBundleCU::SolveNormalEquationPCGB(float lambda) { +- //---------------------------------------------------------- +- //(Jt * J + lambda * diag(Jt * J)) X = Jt * e +- //------------------------------------------------------------- +- TimerBA timer(this, TIMER_CG_ITERATION); +- __recent_cg_status = ' '; +- +- // diagonal for jacobian preconditioning... +- int plen = GetParameterLength(); +- CuTexImage null; +- CuTexImage& VectorDP = +- __lm_use_diagonal_damp ? 
_cuVectorJJ : null; // diagonal +- CuTexImage& VectorQK = _cuVectorZK; // temporary +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- +- //////////////////////////////////////////////////////// +- ApplyBlockPC(_cuVectorJtE, +- _cuVectorPK); // z(0) = p(0) = M * r(0)//r(0) = Jt * e +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(0) = J * p(0) +- +- ////////////////////////////////////////////////// +- float_t rtz0 = (float_t)ComputeVectorDot(_cuVectorJtE, _cuVectorPK, +- _cuBufferData); // r(0)' * z(0) +- float_t qtq0 = +- (float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData); // q(0)' * q(0) +- float_t ptdp0 = (float_t)ComputeVectorNormW( +- _cuVectorPK, VectorDP, _cuBufferData); // p(0)' * DDD * p(0) +- float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0); +- +- if (__verbose_cg_iteration) +- std::cout << " --0,\t alpha = " << alpha0 +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- if (!isfinite(alpha0)) { +- return 0; +- } +- if (alpha0 == 0) { +- __recent_cg_status = 'I'; +- return 1; +- } +- +- //////////////////////////////////////////////////////////// +- ComputeSAX((float)alpha0, _cuVectorPK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- ComputeJtE(_cuVectorJX, VectorQK); // Jt * (J * p0) +- +- ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK, +- VectorQK); // Jt * J * p0 + lambda * DDD * p0 +- ComputeSAXPY( +- (float)-alpha0, VectorQK, _cuVectorJtE, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- +- float_t rtzk = rtz0, rtz_min = rtz0, betak; +- int iteration = 1; +- ++__num_cg_iteration; +- +- while (true) { +- ApplyBlockPC(_cuVectorRK, _cuVectorZK); +- +- /////////////////////////////////////////////////////////////////////////// +- float_t rtzp = rtzk; +- rtzk = (float_t)ComputeVectorDot( +- _cuVectorRK, _cuVectorZK, +- _cuBufferData); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1) +- float_t rtz_ratio = sqrt(fabs(rtzk / rtz0)); +- if (rtz_ratio < __cg_norm_threshold) { +- if (__recent_cg_status == ' ') +- __recent_cg_status = iteration < std::min(10, __cg_min_iteration) +- ? '0' + iteration +- : 'N'; +- if (iteration >= __cg_min_iteration) break; +- } +- +- //////////////////////////////////////////////////////////////////////////// +- betak = rtzk / rtzp; // beta +- rtz_min = std::min(rtz_min, rtzk); +- +- ComputeSAXPY((float)betak, _cuVectorPK, _cuVectorZK, +- _cuVectorPK); // p(k) = z(k) + b(k) * p(k - 1) +- ComputeJX(_cuVectorPK, _cuVectorJX); // q(k) = J * p(k) +- ////////////////////////////////////////////////////////////////////// +- +- float_t qtqk = +- (float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData); // q(k)' q(k) +- float_t ptdpk = (float_t)ComputeVectorNormW( +- _cuVectorPK, VectorDP, _cuBufferData); // p(k)' * DDD * p(k) +- float_t alphak = rtzk / (qtqk + lambda * ptdpk); +- +- ///////////////////////////////////////////////////// +- if (__verbose_cg_iteration) +- std::cout << " --" << iteration << ",\t alpha= " << alphak +- << ", rtzk/rtz0 = " << rtz_ratio +- << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n"; +- +- /////////////////////////////////////////////////// +- if (!isfinite(alphak) || rtz_ratio > __cg_norm_guard) { +- __recent_cg_status = 'X'; +- break; +- } // something doesn't converge.. 
+- +- //////////////////////////////////////////////// +- ComputeSAXPY((float)alphak, _cuVectorPK, _cuVectorXK, +- _cuVectorXK); // x(k+1) = x(k) + a(k) * p(k) +- +- ///////////////////////////////////////////////// +- ++iteration; +- ++__num_cg_iteration; +- if (iteration >= std::min(__cg_max_iteration, plen)) break; +- +- // if(iteration == 2 && rtz_ratio < __cg_norm_threshold) +- if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) { +- ////r = JtE - (Jt J + lambda * D) x +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ(lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK); +- ComputeSAXPY(-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK); +- } else { +- ComputeJtE(_cuVectorJX, VectorQK); +- ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK, VectorQK); // +- ComputeSAXPY( +- (float)-alphak, VectorQK, _cuVectorRK, +- _cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ; +- } +- } +- return iteration; +-} +- +-int SparseBundleCU::SolveNormalEquation(float lambda) { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1); +- return 1; +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- ComputeBlockPC(lambda, __lm_use_diagonal_damp); +- ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2); +- return 1; +- } else { +- ////solve linear system using Conjugate Gradients +- return __cg_schur_complement ? SolveNormalEquationPCGX(lambda) +- : SolveNormalEquationPCGB(lambda); +- } +-} +- +-void SparseBundleCU::RunTestIterationLM(bool reduced) { +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- if (reduced) +- SolveNormalEquationPCGX(__lm_initial_damp); +- else +- SolveNormalEquationPCGB(__lm_initial_damp); +- UpdateCameraPoint(_cuVectorZK, _cuImageProj); +- ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData); +- ComputeJX(_cuVectorXK, _cuVectorJX); +- ComputeVectorNorm(_cuVectorJX, _cuBufferData); +-} +- +-float SparseBundleCU::UpdateCameraPoint(CuTexImage& dx, +- CuTexImage& cuImageTempProj) { +- ConfigBA::TimerBA timer(this, TIMER_FUNCTION_UP, true); +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera); +- ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx, +- _cuCameraDataEX, _cuPointDataEX, +- __bundle_current_mode); +- return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- if (__jacobian_normalize) +- ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 4 * _num_point, 8 * _num_camera); +- ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx, +- _cuCameraDataEX, _cuPointDataEX, +- __bundle_current_mode); +- return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj); +- } else { +- if (__jacobian_normalize) ComputeVXY(_cuVectorXK, _cuVectorSJ, dx); +- ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx, +- _cuCameraDataEX, _cuPointDataEX, +- __bundle_current_mode); +- return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj); +- } +-} +- +-float SparseBundleCU::SaveUpdatedSystem(float residual_reduction, +- float dx_sqnorm, float damping) { +- float expected_reduction; +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- CuTexImage xk; +- 
xk.SetTexture(_cuVectorXK.data(), 8 * _num_camera); +- CuTexImage jte; +- jte.SetTexture(_cuVectorJtE.data(), 8 * _num_camera); +- float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData); +- if (__lm_use_diagonal_damp) { +- CuTexImage jj; +- jj.SetTexture(_cuVectorJJ.data(), 8 * _num_camera); +- float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuCameraData.SwapData(_cuCameraDataEX); +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- CuTexImage xk; +- xk.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point); +- CuTexImage jte; +- jte.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, 4 * _num_point); +- float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData); +- if (__lm_use_diagonal_damp) { +- CuTexImage jj; +- jj.SetTexture(_cuVectorJJ.data() + 8 * _num_camera, 4 * _num_point); +- float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- _cuPointData.SwapData(_cuPointDataEX); +- } else { +- float dxtg = +- (float)ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData); +- +- if (__accurate_gain_ratio) { +- ComputeJX(_cuVectorXK, _cuVectorJX); +- float njx = (float)ComputeVectorNorm(_cuVectorJX, _cuBufferData); +- expected_reduction = 2.0f * dxtg - njx; +- // could the expected reduction be negative??? not sure +- if (expected_reduction <= 0) +- expected_reduction = 0.001f * residual_reduction; +- } else if (__lm_use_diagonal_damp) { +- float dq = +- (float)ComputeVectorNormW(_cuVectorXK, _cuVectorJJ, _cuBufferData); +- expected_reduction = damping * dq + dxtg; +- } else { +- expected_reduction = damping * dx_sqnorm + dxtg; +- } +- +- /// save the new motion/struture +- _cuCameraData.SwapData(_cuCameraDataEX); +- _cuPointData.SwapData(_cuPointDataEX); +- +- //_cuCameraData.CopyToHost(_camera_data); +- //_cuPointData.CopyToHost(_point_data); +- // DebugProjections(); +- } +- //////////////////////////////////////////// +- return float(residual_reduction / expected_reduction); +-} +- +-void SparseBundleCU::AdjustBundleAdjsutmentMode() { +- if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- _cuJacobianCamera.InitTexture(0, 0); +- _cuJacobianCameraT.InitTexture(0, 0); +- } +-} +- +-float SparseBundleCU::EvaluateDeltaNorm() { +- if (__bundle_current_mode == BUNDLE_ONLY_MOTION) { +- CuTexImage temp; +- temp.SetTexture(_cuVectorXK.data(), 8 * _num_camera); +- return ComputeVectorNorm(temp, _cuBufferData); +- +- } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) { +- CuTexImage temp; +- temp.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point); +- return ComputeVectorNorm(temp, _cuBufferData); +- } else { +- return (float)ComputeVectorNorm(_cuVectorXK, _cuBufferData); +- } +-} +- +-void SparseBundleCU::NonlinearOptimizeLM() { +- //////////////////////////////////////// +- TimerBA timer(this, TIMER_OPTIMIZATION); +- +- //////////////////////////////////////////////// +- float mse_convert_ratio = +- 1.0f / (_num_imgpt * __focal_scaling * __focal_scaling); +- float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f; +- const int edwidth = __verbose_sse ? 
12 : 8; +- _projection_sse = +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- __initial_mse = __final_mse = _projection_sse * mse_convert_ratio; +- +- // compute jacobian diagonals for normalization +- if (__jacobian_normalize) PrepareJacobianNormalization(); +- +- // evalaute jacobian +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- /////////////////////////////////////////////////////////////// +- if (__verbose_level) +- std::cout << "Initial " << (__verbose_sse ? "sumed" : "mean") +- << " squared error = " << __initial_mse * error_display_ratio +- << "\n----------------------------------------------\n"; +- +- ////////////////////////////////////////////////// +- CuTexImage& cuImageTempProj = _cuVectorJX; +- // CuTexImage& cuVectorTempJX = _cuVectorJX; +- CuTexImage& cuVectorDX = _cuVectorSJ.IsValid() ? _cuVectorZK : _cuVectorXK; +- +- ////////////////////////////////////////////////// +- float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf; +- SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm, +- g_inf); +- +- //////////////////////////////////// +- std::cout << std::left; +- for (int i = 0; i < __lm_max_iteration && !__abort_flag; +- __current_iteration = (++i)) { +- ////solve linear system +- int num_cg_iteration = SolveNormalEquation(damping); +- +- // there must be NaN somewhere +- if (num_cg_iteration == 0) { +- if (__verbose_level) +- std::cout << "#" << std::setw(3) << i << " quit on numeric errors\n"; +- __pba_return_code = 'E'; +- break; +- } +- +- // there must be infinity somewhere +- if (__recent_cg_status == 'I') { +- std::cout << "#" << std::setw(3) << i << " 0 I e=" << std::setw(edwidth) +- << "------- " +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << '\n' << std::setprecision(6); +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- --i; +- continue; +- } +- +- ///////////////////// +- ++__num_lm_iteration; +- +- //////////////////////////////////// +- float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm); +- +- // In this library, we check absolute difference instead of realtive +- // difference +- if (dx_norm <= __lm_delta_threshold) { +- // damping factor must be way too big...or it converges +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " quit on too small change (" << dx_norm << " < " +- << __lm_delta_threshold << ")\n"; +- __pba_return_code = 'S'; +- break; +- } +- /////////////////////////////////////////////////////////////////////// +- // update structure and motion, check reprojection error +- float new_residual = UpdateCameraPoint(cuVectorDX, cuImageTempProj); +- float average_residual = new_residual * mse_convert_ratio; +- float residual_reduction = _projection_sse - new_residual; +- +- // do we find a better solution? 
+- if (isfinite(new_residual) && residual_reduction > 0) { +- ////compute relative norm change +- float relative_reduction = 1.0f - (new_residual / _projection_sse); +- +- //////////////////////////////////// +- __num_lm_success++; // increase counter +- _projection_sse = new_residual; // save the new residual +- _cuImageProj.SwapData(cuImageTempProj); // save the new projection +- +- ///////////////gain ratio//////////////////// +- float gain_ratio = +- SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping); +- +- ///////////////////////////////////// +- SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping, +- g_norm, g_inf); +- +- ///////////////////////////////////////////// +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=" << std::setw(6) +- << floor(gain_ratio * 1000.f) * 0.001f +- << " g=" << std::setw(g_norm > 0 ? 9 : 1) << g_norm << " " +- << std::setw(9) << relative_reduction << ' ' << std::setw(9) +- << dx_norm << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- ///////////////////////////// +- if (!IsTimeBudgetAvailable()) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " used up time budget.\n"; +- __pba_return_code = 'T'; +- break; +- } else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i +- << " converged with small gradient\n"; +- __pba_return_code = 'G'; +- break; +- } else if (average_residual * error_display_ratio <= __lm_mse_threshold) { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " satisfies MSE threshold\n"; +- __pba_return_code = 'M'; +- break; +- } else { +- /////////////////////////////adjust damping factor +- float temp = gain_ratio * 2.0f - 1.0f; +- float adaptive_adjust = 1.0f - temp * temp * temp; // powf(, 3.0f); // +- float auto_adjust = std::max(1.0f / 3.0f, adaptive_adjust); +- +- ////////////////////////////////////////////////// +- damping = damping * auto_adjust; +- damping_adjust = 2.0f; +- if (damping < __lm_minimum_damp) +- damping = __lm_minimum_damp; +- else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp && +- __lm_use_diagonal_damp) +- damping = __lm_maximum_damp; +- +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- } +- } else { +- if (__verbose_level > 1) +- std::cout << "#" << std::setw(3) << i << " " << std::setw(3) +- << num_cg_iteration << char(__recent_cg_status) +- << " e=" << std::setw(edwidth) << std::left +- << average_residual * error_display_ratio +- << " u=" << std::setprecision(3) << std::setw(9) << damping +- << " r=----- " << (__lm_check_gradient || __save_gradient_norm +- ? 
" g=---------" +- : " g=0") +- << " --------- " << std::setw(9) << dx_norm +- << " t=" << int(BundleTimerGetNow()) << "\n" +- << std::setprecision(6); +- +- if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp && +- damping > __lm_damping_auto_switch) { +- __lm_use_diagonal_damp = false; +- damping = __lm_damping_auto_switch; +- damping_adjust = 2.0f; +- if (__verbose_level > 1) +- std::cout << "NOTE: switch to damping with an identity matix\n"; +- } else { +- /////////////increase damping factor +- damping = damping * damping_adjust; +- damping_adjust = 2.0f * damping_adjust; +- } +- } +- +- if (__verbose_level == 1) std::cout << '.'; +- } +- +- __final_mse = float(_projection_sse * mse_convert_ratio); +- __final_mse_x = +- __use_radial_distortion +- ? EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) * +- mse_convert_ratio +- : __final_mse; +-} +- +-#define PROFILE_(A, B) \ +- BundleTimerStart(TIMER_PROFILE_STEP); \ +- for (int i = 0; i < repeat; ++i) { \ +- B; \ +- FinishWorkCUDA(); \ +- } \ +- BundleTimerSwitch(TIMER_PROFILE_STEP); \ +- std::cout << std::setw(24) << A << ": " \ +- << (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n"; +- +-#define PROFILE(A, B) PROFILE_(#A, A B) +-#define PROXILE(A, B) PROFILE_(A, B) +- +-void SparseBundleCU::RunProfileSteps() { +- const int repeat = __profile_pba; +- std::cout << "---------------------------------\n" +- "| Run profiling steps (" +- << repeat << ") |\n" +- "---------------------------------\n" +- << std::left; +- ; +- +- /////////////////////////////////////////////// +- PROXILE("Upload Measurements", +- _cuMeasurements.CopyFromHost( +- _imgpt_datax.size() > 0 ? &_imgpt_datax[0] : _imgpt_data)); +- PROXILE("Upload Point Data", _cuPointData.CopyToHost(_point_data)); +- std::cout << "---------------------------------\n"; +- +- ///////////////////////////////////////////// +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- PrepareJacobianNormalization(); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- ComputeBlockPC(__lm_initial_damp, true); +- FinishWorkCUDA(); +- +- do { +- if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 && +- SolveNormalEquationPCGB(__lm_initial_damp) == 10) +- break; +- __lm_initial_damp *= 2.0f; +- } while (__lm_initial_damp < 1024.0f); +- std::cout << "damping set to " << __lm_initial_damp << " for profiling\n" +- << "---------------------------------\n"; +- +- { +- int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration; +- __cg_max_iteration = __cg_min_iteration = 10; +- __num_cg_iteration = 0; +- PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- +- ///////////////////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- std::cout << "---------------------------------\n"; +- ////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMX", RunTestIterationLM(true)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations in all\n"; +- //////////////////////////////////////////////////////// +- __num_cg_iteration = 0; +- PROXILE("Single iteration LMB", RunTestIterationLM(false)); +- if (__num_cg_iteration != 100) +- std::cout << __num_cg_iteration << " cg iterations 
in all\n"; +- std::cout << "---------------------------------\n"; +- __cg_max_iteration = cgmax; +- __cg_min_iteration = cgmin; +- } +- ///////////////////////////////////////////////////// +- PROFILE(UpdateCameraPoint, (_cuVectorZK, _cuImageProj)); +- PROFILE(ComputeVectorNorm, (_cuVectorXK, _cuBufferData)); +- PROFILE(ComputeVectorDot, (_cuVectorXK, _cuVectorRK, _cuBufferData)); +- PROFILE(ComputeVectorNormW, (_cuVectorXK, _cuVectorRK, _cuBufferData)); +- PROFILE(ComputeSAXPY, (0.01f, _cuVectorXK, _cuVectorRK, _cuVectorZK)); +- PROFILE(ComputeSXYPZ, +- (0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK)); +- std::cout << "---------------------------------\n"; +- PROFILE(ComputeVectorNorm, (_cuImageProj, _cuBufferData)); +- PROFILE(ComputeSAXPY, (0.000f, _cuImageProj, _cuVectorJX, _cuVectorJX)); +- std::cout << "---------------------------------\n"; +- +- __multiply_jx_usenoj = false; +- /////////////////////////////////////////////////////// +- PROFILE(EvaluateProjection, (_cuCameraData, _cuPointData, _cuImageProj)); +- PROFILE(ApplyBlockPC, (_cuVectorJtE, _cuVectorPK)); +- ///////////////////////////////////////////////// +- if (!__no_jacobian_store) { +- if (__jc_store_original) { +- PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- PROFILE(EvaluateJacobians, (false)); +- +- if (__jc_store_transpose) { +- PROFILE( +- ShuffleCameraJacobian, +- (_cuJacobianCamera, _cuCameraMeasurementList, _cuJacobianCameraT)); +- PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- __jc_store_original = false; +- PROFILE(EvaluateJacobians, ()); +- __jc_store_original = true; +- } +- ////////////////////////////////////////////////// +- +- std::cout << "---------------------------------\n" +- "| Not storing transpose JC | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- +- ////////////////////////////////////// +- +- } else if (__jc_store_transpose) { +- PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- std::cout << "---------------------------------\n" +- "| Not storing original JC | \n" +- "---------------------------------\n"; +- PROFILE(EvaluateJacobians, ()); +- } +- } +- +- if (!__no_jacobian_store) { +- std::cout << "---------------------------------\n" +- "| Not storing Camera Jacobians | \n" +- "---------------------------------\n"; +- __jc_store_transpose = false; +- __jc_store_original = false; +- _cuJacobianCamera.ReleaseData(); +- _cuJacobianCameraT.ReleaseData(); +- PROFILE(EvaluateJacobians, ()); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- } +- +- /////////////////////////////////////////////// +- +- std::cout << "---------------------------------\n" +- "| Not storing any jacobians |\n" +- "---------------------------------\n"; +- __no_jacobian_store = true; +- _cuJacobianPoint.ReleaseData(); +- PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX)); +- PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE)); +- PROFILE(ComputeBlockPC, (0.001f, true)); +- +- std::cout << "---------------------------------\n"; +-} +- 
+-void SparseBundleCU::RunDebugSteps() { +- EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj); +- EvaluateJacobians(); +- ComputeJtE(_cuImageProj, _cuVectorJtE); +- // DEBUG_FUNCN(_cuVectorXK, SolveNormalEquationPCGB, (0.001f), 100); +- DEBUG_FUNCN(_cuVectorJtE, ComputeJtE, (_cuImageProj, _cuVectorJtE), 100); +- DEBUG_FUNCN(_cuVectorJX, ComputeJX, (_cuVectorJtE, _cuVectorJX), 100); +-} +- +-void SparseBundleCU::SaveNormalEquation(float lambda) { +- ofstream out1("../../matlab/cg_j.txt"); +- ofstream out2("../../matlab/cg_b.txt"); +- ofstream out3("../../matlab/cg_x.txt"); +- +- out1 << std::setprecision(20); +- out2 << std::setprecision(20); +- out3 << std::setprecision(20); +- +- int plen = GetParameterLength(); +- vector jc(16 * _num_imgpt); +- vector jp(8 * _num_imgpt); +- vector ee(2 * _num_imgpt); +- vector dx(plen); +- +- _cuJacobianCamera.CopyToHost(&jc[0]); +- _cuJacobianPoint.CopyToHost(&jp[0]); +- _cuImageProj.CopyToHost(&ee[0]); +- _cuVectorXK.CopyToHost(&dx[0]); +- +- for (int i = 0; i < _num_imgpt; ++i) { +- out2 << ee[i * 2] << ' ' << ee[i * 2 + 1] << ' '; +- int cidx = _camera_idx[i], pidx = _point_idx[i]; +- float *cp = &jc[i * 16], *pp = &jp[i * 8]; +- int cmin = cidx * 8, pmin = 8 * _num_camera + pidx * 4; +- for (int j = 0; j < 8; ++j) +- out1 << (i * 2 + 1) << ' ' << (cmin + j + 1) << ' ' << cp[j] << '\n'; +- for (int j = 0; j < 8; ++j) +- out1 << (i * 2 + 2) << ' ' << (cmin + j + 1) << ' ' << cp[j + 8] << '\n'; +- for (int j = 0; j < 4; ++j) +- out1 << (i * 2 + 1) << ' ' << (pmin + j + 1) << ' ' << pp[j] << '\n'; +- for (int j = 0; j < 4; ++j) +- out1 << (i * 2 + 2) << ' ' << (pmin + j + 1) << ' ' << pp[j + 4] << '\n'; +- } +- +- for (size_t i = 0; i < dx.size(); ++i) out3 << dx[i] << ' '; +- +- std::cout << "lambda = " << std::setprecision(20) << lambda << '\n'; +-} +- +-} // namespace pba +diff --git a/lib/PBA/SparseBundleCU.h b/lib/PBA/SparseBundleCU.h +deleted file mode 100644 +index 7183deb67..000000000 +--- a/lib/PBA/SparseBundleCU.h ++++ /dev/null +@@ -1,176 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: SparseBundleCU.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : interface of the CUDA-version of multicore bundle +-// adjustment +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#if !defined(SPARSE_BUNDLE_CU_H) +-#define SPARSE_BUNDLE_CU_H +- +-#include "ConfigBA.h" +-#include "CuTexImage.h" +-#include "DataInterface.h" +- +-namespace pba { +- +-class SparseBundleCU : public ParallelBA, public ConfigBA { +- protected: // cpu data +- int _num_camera; +- int _num_point; +- int _num_imgpt; +- CameraT* _camera_data; +- float* _point_data; +- //////////////////////////////// +- const float* _imgpt_data; +- const int* _camera_idx; +- const int* _point_idx; +- const int* _focal_mask; +- std::vector _imgpt_datax; +- //////////////////////// +- float _projection_sse; // sumed square error +- protected: // cuda data +- CuTexImage _cuCameraData; +- CuTexImage _cuCameraDataEX; +- CuTexImage _cuPointData; +- CuTexImage _cuPointDataEX; +- CuTexImage _cuMeasurements; +- CuTexImage _cuImageProj; +- CuTexImage _cuJacobianCamera; +- CuTexImage _cuJacobianPoint; +- CuTexImage _cuJacobianCameraT; +- CuTexImage _cuProjectionMap; +- CuTexImage _cuPointMeasurementMap; +- CuTexImage _cuCameraMeasurementMap; +- CuTexImage _cuCameraMeasurementList; +- CuTexImage _cuCameraMeasurementListT; +- +- /////////////////////////////// +- CuTexImage _cuBufferData; +- //////////////////////////// +- CuTexImage _cuBlockPC; +- CuTexImage _cuVectorSJ; +- +- /// LM normal equation +- CuTexImage _cuVectorJtE; +- CuTexImage _cuVectorJJ; +- CuTexImage _cuVectorJX; +- CuTexImage _cuVectorXK; +- CuTexImage _cuVectorPK; +- CuTexImage _cuVectorZK; +- CuTexImage _cuVectorRK; +- +- /////////////////////// +- protected: +- int _num_imgpt_q; +- float _weight_q; +- CuTexImage _cuCameraQList; +- CuTexImage _cuCameraQMap; +- CuTexImage _cuCameraQMapW; +- CuTexImage _cuCameraQListW; +- +- protected: +- bool ProcessIndexCameraQ(std::vector& qmap, std::vector& qlist); +- void ProcessWeightCameraQ(std::vector& cpnum, std::vector& qmap, +- std::vector& qmapw, +- std::vector& qlistw); +- +- protected: // internal functions +- int GetParameterLength(); +- int InitializeBundle(); +- int ValidateInputData(); +- void ReleaseAllocatedData(); +- bool InitializeStorageForCG(); +- bool InitializeBundleGPU(); +- bool TransferDataToGPU(); +- void TransferDataToHost(); +- void DenormalizeData(); +- void NormalizeData(); +- void NormalizeDataF(); +- void NormalizeDataD(); +- void DebugProjections(); +- void RunDebugSteps(); +- bool CheckRequiredMem(int fresh = 1); +- bool CheckRequiredMemX(); +- void ReserveStorage(size_t ncam, size_t npt, size_t nproj); +- void ReserveStorageAuto(); +- +- protected: +- float EvaluateProjection(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj); +- float EvaluateProjectionX(CuTexImage& cam, CuTexImage& point, +- CuTexImage& proj); +- float UpdateCameraPoint(CuTexImage& dx, CuTexImage& cuImageTempProj); +- float SaveUpdatedSystem(float residual_reduction, float dx_sqnorm, +- float damping); +- float EvaluateDeltaNorm(); +- void EvaluateJacobians(bool shuffle = true); +- void PrepareJacobianNormalization(); +- void ComputeJtE(CuTexImage& E, CuTexImage& JtE, int mode = 0); +- void ComputeJX(CuTexImage& X, CuTexImage& JX, int mode = 0); +- void ComputeDiagonal(CuTexImage& JJ, CuTexImage& JJI); +- void ComputeBlockPC(float lambda, bool dampd = true); +- void ApplyBlockPC(CuTexImage& v, CuTexImage& pv, int mode = 0); +- int SolveNormalEquationPCGB(float lambda); +- int SolveNormalEquationPCGX(float lambda); +- int SolveNormalEquation(float lambda); +- void AdjustBundleAdjsutmentMode(); +- void 
NonlinearOptimizeLM(); +- void BundleAdjustment(); +- void RunTestIterationLM(bool reduced); +- void SaveBundleRecord(int iter, float res, float damping, float& g_norm, +- float& g_inf); +- ///////////////////////////////// +- void SaveNormalEquation(float lambda); +- void RunProfileSteps(); +- void WarmupDevice(); +- +- public: +- virtual float GetMeanSquaredError(); +- virtual void SetCameraData(size_t ncam, CameraT* cams); +- virtual void SetPointData(size_t npoint, Point3D* pts); +- virtual void SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx); +- virtual void SetFocalMask(const int* fmask, float weight); +- virtual int RunBundleAdjustment(); +- +- /// +- virtual void AbortBundleAdjustment() { __abort_flag = true; } +- virtual int GetCurrentIteration() { return __current_iteration; } +- virtual void SetNextTimeBudget(int seconds) { +- __bundle_time_budget = seconds; +- } +- virtual void SetNextBundleMode(BundleModeT mode) { +- __bundle_mode_next = mode; +- } +- virtual void SetFixedIntrinsics(bool fixed) { __fixed_intrinsics = fixed; } +- virtual void EnableRadialDistortion(DistortionT type) { +- __use_radial_distortion = type; +- } +- virtual void ParseParam(int narg, char** argv) { +- ConfigBA::ParseParam(narg, argv); +- } +- virtual ConfigBA* GetInternalConfig() { return this; } +- +- public: +- SparseBundleCU(int device); +- size_t GetMemCapacity(); +-}; +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/pba.cpp b/lib/PBA/pba.cpp +deleted file mode 100644 +index 77d62b070..000000000 +--- a/lib/PBA/pba.cpp ++++ /dev/null +@@ -1,134 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: pba.cpp +-// Author: Changchang Wu +-// Description : implementation of ParallelBA, which is a wrapper around +-// the GPU-based and CPU-based implementations +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +-#include +-#include +-#include "pba.h" +-#include "SparseBundleCU.h" +-#include "SparseBundleCPU.h" +- +-namespace pba { +- +-ParallelBA::ParallelBA(DeviceT device, const int num_threads) { +- // The wrapper intends to provide different implementations. 
+- +- if (device >= PBA_CUDA_DEVICE_DEFAULT) +-#ifndef PBA_NO_GPU +- { +- SparseBundleCU* cuba = new SparseBundleCU(device - PBA_CUDA_DEVICE0); +- if (cuba->GetMemCapacity() > 0) { +- _optimizer = cuba; +- } else { +- device = PBA_CPU_FLOAT; +- _optimizer = NewSparseBundleCPU(false, num_threads); +- delete cuba; +- } +- } else +-#else +- device = PBA_CPU_FLOAT; +-#endif +- if (device == PBA_CPU_FLOAT) +- _optimizer = NewSparseBundleCPU(false, num_threads); +- else if (device == PBA_CPU_DOUBLE) +- _optimizer = NewSparseBundleCPU(true, num_threads); +- else +- _optimizer = NULL; +-} +- +-ParallelBA::~ParallelBA() { +- if (_optimizer) delete _optimizer; +-} +- +-void ParallelBA::ParseParam(int narg, char** argv) { +- _optimizer->ParseParam(narg, argv); +-} +- +-ConfigBA* ParallelBA::GetInternalConfig() { +- if (_optimizer) +- return _optimizer->GetInternalConfig(); +- else +- return NULL; +-} +- +-void ParallelBA::SetFixedIntrinsics(bool fixed) { +- _optimizer->SetFixedIntrinsics(fixed); +-} +-void ParallelBA::EnableRadialDistortion(DistortionT enabled) { +- _optimizer->EnableRadialDistortion(enabled); +-} +-void ParallelBA::SetNextTimeBudget(int seconds) { +- _optimizer->SetNextTimeBudget(seconds); +-} +- +-void ParallelBA::SetNextBundleMode(BundleModeT mode) { +- _optimizer->SetNextBundleMode(mode); +-} +- +-void ParallelBA::SetCameraData(size_t ncam, CameraT* cams) { +- _optimizer->SetCameraData(ncam, cams); +-} +- +-void ParallelBA::SetPointData(size_t npoint, Point3D* pts) { +- _optimizer->SetPointData(npoint, pts); +-} +- +-void ParallelBA::SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, const int* cam_idx) { +- _optimizer->SetProjection(nproj, imgpts, point_idx, cam_idx); +-} +-int ParallelBA::RunBundleAdjustment() { +- return _optimizer->RunBundleAdjustment(); +-} +- +-float ParallelBA::GetMeanSquaredError() { +- return _optimizer->GetMeanSquaredError(); +-} +- +-int ParallelBA::GetCurrentIteration() { +- return _optimizer->GetCurrentIteration(); +-} +-void ParallelBA::AbortBundleAdjustment() { +- return _optimizer->AbortBundleAdjustment(); +-} +- +-void ParallelBA::ReserveStorage(size_t ncam, size_t npt, size_t nproj) { +- if (_optimizer) _optimizer->ReserveStorage(ncam, npt, nproj); +-} +- +-void ParallelBA::SetFocalMask(const int* fmask, float weight) { +- if (_optimizer && weight > 0) _optimizer->SetFocalMask(fmask, weight); +-} +- +-// void* ParallelBA::operator new(size_t size) { +-// void* p = malloc(size); +-// if (p == 0) { +-// const std::bad_alloc ba; +-// throw ba; +-// } +-// return p; +-// } +- +-ParallelBA* NewParallelBA(ParallelBA::DeviceT device) { +- return new ParallelBA(device); +-} +- +-int ParallelBA_GetVersion() { return 105; } +- +-} // namespace pba +diff --git a/lib/PBA/pba.h b/lib/PBA/pba.h +deleted file mode 100644 +index 3ebf5813f..000000000 +--- a/lib/PBA/pba.h ++++ /dev/null +@@ -1,156 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: pba.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : interface of class ParallelBA, which has two +-//implementations +-// SparseBundleCU for CUDA-based version, and +-// SparseBundleCPU for CPU multi-threading version +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either 
+-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. +-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#ifndef PARALLEL_BA_H +-#define PARALLEL_BA_H +- +-#if defined(_WIN32) +-#ifdef PBA_DLL +-#ifdef DLL_EXPORT +-#define PBA_EXPORT __declspec(dllexport) +-#else +-#define PBA_EXPORT __declspec(dllimport) +-#endif +-#else +-#define PBA_EXPORT +-#endif +- +-#define PBA_EXPORT_EXTERN PBA_EXPORT +- +-#if _MSC_VER > 1000 +-#pragma once +-#endif +-#else +-#define PBA_EXPORT +-#define PBA_EXPORT_EXTERN extern "C" +-#endif +- +-// filetype definitions for points and camera +-#include "DataInterface.h" +-#include "ConfigBA.h" +- +-namespace pba { +- +-class ParallelBA { +- public: +- enum StatusT { +- STATUS_SUCCESS = 0, +- STATUS_CAMERA_MISSING = 1, +- STATUS_POINT_MISSING, +- STATUS_PROJECTION_MISSING, +- STATUS_MEASURMENT_MISSING, +- STATUS_ALLOCATION_FAIL +- }; +- enum DeviceT { +- PBA_INVALID_DEVICE = -4, +- PBA_CPU_DOUBLE = -3, +- PBA_CPU_FLOAT = -2, +- PBA_CUDA_DEVICE_DEFAULT = -1, +- PBA_CUDA_DEVICE0 = 0 +- }; +- enum DistortionT { +- PBA_MEASUREMENT_DISTORTION = -1, // single parameter, apply to measurements +- PBA_NO_DISTORTION = 0, // no radial distortion +- PBA_PROJECTION_DISTORTION = 1 // single parameter, apply to projectino +- }; +- enum BundleModeT { +- BUNDLE_FULL = 0, +- BUNDLE_ONLY_MOTION = 1, +- BUNDLE_ONLY_STRUCTURE = 2, +- }; +- +- private: +- ParallelBA* _optimizer; +- +- public: +- //////////////////////////////////////////////////// +- // methods for changing bundle adjustment settings +- PBA_EXPORT virtual void ParseParam(int narg, char** argv); // indirect method +- PBA_EXPORT virtual ConfigBA* GetInternalConfig(); // direct method +- PBA_EXPORT virtual void SetFixedIntrinsics( +- bool fixed); // call this for calibrated system +- PBA_EXPORT virtual void EnableRadialDistortion( +- DistortionT type); // call this to enable radial distortion +- PBA_EXPORT virtual void SetNextTimeBudget( +- int seconds); //# of seconds for next run (0 = no limit) +- PBA_EXPORT virtual void ReserveStorage(size_t ncam, size_t npt, size_t nproj); +- +- public: +- // function name change; the old one is mapped as inline function +- inline void SetFocalLengthFixed(bool fixed) { SetFixedIntrinsics(fixed); } +- inline void ResetBundleStorage() { +- ReserveStorage(0, 0, 0); /*Reset devide for CUDA*/ +- } +- +- public: +- ///////////////////////////////////////////////////// +- // optimizer interface, input and run +- PBA_EXPORT virtual void SetCameraData(size_t ncam, +- CameraT* cams); // set camera data +- PBA_EXPORT virtual void SetPointData(size_t npoint, +- Point3D* pts); // set 3D point data +- PBA_EXPORT virtual void SetProjection(size_t nproj, const Point2D* imgpts, +- const int* point_idx, +- const int* cam_idx); // set projections +- PBA_EXPORT virtual void SetNextBundleMode( +- BundleModeT +- mode = BUNDLE_FULL); // mode of the next bundle adjustment call +- PBA_EXPORT virtual int RunBundleAdjustment(); // start bundle adjustment, +- // return number of successful +- // LM iterations +- public: +- ////////////////////////////////////////////////// +- // Query optimzer runing status for Multi-threading +- // Three functions below can be called from a differnt thread while bundle 
is +- // running +- PBA_EXPORT virtual float +- GetMeanSquaredError(); // read back results during/after BA +- PBA_EXPORT virtual void +- AbortBundleAdjustment(); // tell bundle adjustment to abort ASAP +- PBA_EXPORT virtual int +- GetCurrentIteration(); // which iteration is it working on? +- public: +- PBA_EXPORT ParallelBA(DeviceT device = PBA_CUDA_DEVICE_DEFAULT, +- const int num_threads = -1); +- // PBA_EXPORT void* operator new(size_t size); +- PBA_EXPORT virtual ~ParallelBA(); +- +- public: +- ////////////////////////////////////////////// +- // Future functions will be added to the end for compatiability with old +- // version. +- PBA_EXPORT virtual void SetFocalMask(const int* fmask, float weight = 1.0f); +-}; +- +-// function for dynamic loading of library +-PBA_EXPORT_EXTERN ParallelBA* NewParallelBA( +- ParallelBA::DeviceT device = ParallelBA::PBA_CUDA_DEVICE_DEFAULT); +-typedef ParallelBA* (*NEWPARALLELBAPROC)(ParallelBA::DeviceT); +- +-/////////////////////////////////////////////// +-// older versions do not have this function. +-PBA_EXPORT_EXTERN int ParallelBA_GetVersion(); +- +-} // namespace pba +- +-#endif +diff --git a/lib/PBA/util.h b/lib/PBA/util.h +deleted file mode 100644 +index a63c8bbce..000000000 +--- a/lib/PBA/util.h ++++ /dev/null +@@ -1,753 +0,0 @@ +-//////////////////////////////////////////////////////////////////////////// +-// File: util.h +-// Author: Changchang Wu (ccwu@cs.washington.edu) +-// Description : some utility functions for reading/writing SfM data +-// +-// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu) +-// and the University of Washington at Seattle +-// +-// This library is free software; you can redistribute it and/or +-// modify it under the terms of the GNU General Public +-// License as published by the Free Software Foundation; either +-// Version 3 of the License, or (at your option) any later version. +-// +-// This library is distributed in the hope that it will be useful, +-// but WITHOUT ANY WARRANTY; without even the implied warranty of +-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-// General Public License for more details. 
+-// +-//////////////////////////////////////////////////////////////////////////////// +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-using namespace std; +-#include "DataInterface.h" +- +-namespace pba { +- +-// File loader supports .nvm format and bundler format +-bool LoadModelFile(const char* name, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc); +-void SaveNVM(const char* filename, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, vector& names, +- vector& ptc); +-void SaveBundlerModel(const char* filename, vector& camera_data, +- vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx); +- +-////////////////////////////////////////////////////////////////// +-void AddNoise(vector& camera_data, vector& point_data, +- float percent); +-void AddStableNoise(vector& camera_data, vector& point_data, +- const vector& ptidx, const vector& camidx, +- float percent); +-bool RemoveInvisiblePoints(vector& camera_data, +- vector& point_data, vector& ptidx, +- vector& camidx, vector& measurements, +- vector& names, vector& ptc); +- +-///////////////////////////////////////////////////////////////////////////// +-bool LoadNVM(ifstream& in, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, vector& names, +- vector& ptc) { +- int rotation_parameter_num = 4; +- bool format_r9t = false; +- string token; +- if (in.peek() == 'N') { +- in >> token; // file header +- if (strstr(token.c_str(), "R9T")) { +- rotation_parameter_num = 9; // rotation as 3x3 matrix +- format_r9t = true; +- } +- } +- +- int ncam = 0, npoint = 0, nproj = 0; +- // read # of cameras +- in >> ncam; +- if (ncam <= 1) return false; +- +- // read the camera parameters +- camera_data.resize(ncam); // allocate the camera data +- names.resize(ncam); +- for (int i = 0; i < ncam; ++i) { +- double f, q[9], c[3], d[2]; +- in >> token >> f; +- for (int j = 0; j < rotation_parameter_num; ++j) in >> q[j]; +- in >> c[0] >> c[1] >> c[2] >> d[0] >> d[1]; +- +- camera_data[i].SetFocalLength(f); +- if (format_r9t) { +- camera_data[i].SetMatrixRotation(q); +- camera_data[i].SetTranslation(c); +- } else { +- // older format for compability +- camera_data[i].SetQuaternionRotation(q); // quaternion from the file +- camera_data[i].SetCameraCenterAfterRotation( +- c); // camera center from the file +- } +- camera_data[i].SetNormalizedMeasurementDistortion(d[0]); +- names[i] = token; +- } +- +- ////////////////////////////////////// +- in >> npoint; +- if (npoint <= 0) return false; +- +- // read image projections and 3D points. 
+- point_data.resize(npoint); +- for (int i = 0; i < npoint; ++i) { +- float pt[3]; +- int cc[3], npj; +- in >> pt[0] >> pt[1] >> pt[2] >> cc[0] >> cc[1] >> cc[2] >> npj; +- for (int j = 0; j < npj; ++j) { +- int cidx, fidx; +- float imx, imy; +- in >> cidx >> fidx >> imx >> imy; +- +- camidx.push_back(cidx); // camera index +- ptidx.push_back(i); // point index +- +- // add a measurment to the vector +- measurements.push_back(Point2D(imx, imy)); +- nproj++; +- } +- point_data[i].SetPoint(pt); +- ptc.insert(ptc.end(), cc, cc + 3); +- } +- /////////////////////////////////////////////////////////////////////////////// +- std::cout << ncam << " cameras; " << npoint << " 3D points; " << nproj +- << " projections\n"; +- +- return true; +-} +- +-void SaveNVM(const char* filename, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, vector& names, +- vector& ptc) { +- std::cout << "Saving model to " << filename << "...\n"; +- ofstream out(filename); +- +- out << "NVM_V3_R9T\n" << camera_data.size() << '\n' << std::setprecision(12); +- if (names.size() < camera_data.size()) +- names.resize(camera_data.size(), string("unknown")); +- if (ptc.size() < 3 * point_data.size()) ptc.resize(point_data.size() * 3, 0); +- +- //////////////////////////////////// +- for (size_t i = 0; i < camera_data.size(); ++i) { +- CameraT& cam = camera_data[i]; +- out << names[i] << ' ' << cam.GetFocalLength() << ' '; +- for (int j = 0; j < 9; ++j) out << cam.m[0][j] << ' '; +- out << cam.t[0] << ' ' << cam.t[1] << ' ' << cam.t[2] << ' ' +- << cam.GetNormalizedMeasurementDistortion() << " 0\n"; +- } +- +- out << point_data.size() << '\n'; +- +- for (size_t i = 0, j = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- int* pc = &ptc[i * 3]; +- out << pt.xyz[0] << ' ' << pt.xyz[1] << ' ' << pt.xyz[2] << ' ' << pc[0] +- << ' ' << pc[1] << ' ' << pc[2] << ' '; +- +- size_t je = j; +- while (je < ptidx.size() && ptidx[je] == (int)i) je++; +- +- out << (je - j) << ' '; +- +- for (; j < je; ++j) +- out << camidx[j] << ' ' << " 0 " << measurements[j].x << ' ' +- << measurements[j].y << ' '; +- +- out << '\n'; +- } +-} +- +-bool LoadBundlerOut(const char* name, ifstream& in, +- vector& camera_data, vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx, vector& names, +- vector& ptc) { +- int rotation_parameter_num = 9; +- string token; +- while (in.peek() == '#') std::getline(in, token); +- +- char listpath[1024], filepath[1024]; +- strcpy(listpath, name); +- char* ext = strstr(listpath, ".out"); +- strcpy(ext, "-list.txt\0"); +- +- /////////////////////////////////// +- ifstream listin(listpath); +- if (!listin.is_open()) { +- listin.close(); +- listin.clear(); +- char* slash = strrchr(listpath, '/'); +- if (slash == NULL) slash = strrchr(listpath, '\\'); +- slash = slash ? 
slash + 1 : listpath; +- strcpy(slash, "image_list.txt"); +- listin.open(listpath); +- } +- if (listin) std::cout << "Using image list: " << listpath << '\n'; +- +- // read # of cameras +- int ncam = 0, npoint = 0, nproj = 0; +- in >> ncam >> npoint; +- if (ncam <= 1 || npoint <= 1) return false; +- std::cout << ncam << " cameras; " << npoint << " 3D points;\n"; +- +- // read the camera parameters +- camera_data.resize(ncam); // allocate the camera data +- names.resize(ncam); +- +- bool det_checked = false; +- for (int i = 0; i < ncam; ++i) { +- float f, q[9], c[3], d[2]; +- in >> f >> d[0] >> d[1]; +- for (int j = 0; j < rotation_parameter_num; ++j) in >> q[j]; +- in >> c[0] >> c[1] >> c[2]; +- +- camera_data[i].SetFocalLength(f); +- camera_data[i].SetInvertedR9T(q, c); +- camera_data[i].SetProjectionDistortion(d[0]); +- +- if (listin >> filepath && f != 0) { +- char* slash = strrchr(filepath, '/'); +- if (slash == NULL) slash = strchr(filepath, '\\'); +- names[i] = (slash ? (slash + 1) : filepath); +- std::getline(listin, token); +- +- if (!det_checked) { +- float det = camera_data[i].GetRotationMatrixDeterminant(); +- std::cout << "Check rotation matrix: " << det << '\n'; +- det_checked = true; +- } +- } else { +- names[i] = "unknown"; +- } +- } +- +- // read image projections and 3D points. +- point_data.resize(npoint); +- for (int i = 0; i < npoint; ++i) { +- float pt[3]; +- int cc[3], npj; +- in >> pt[0] >> pt[1] >> pt[2] >> cc[0] >> cc[1] >> cc[2] >> npj; +- for (int j = 0; j < npj; ++j) { +- int cidx, fidx; +- float imx, imy; +- in >> cidx >> fidx >> imx >> imy; +- +- camidx.push_back(cidx); // camera index +- ptidx.push_back(i); // point index +- +- // add a measurment to the vector +- measurements.push_back(Point2D(imx, -imy)); +- nproj++; +- } +- point_data[i].SetPoint(pt[0], pt[1], pt[2]); +- ptc.insert(ptc.end(), cc, cc + 3); +- } +- /////////////////////////////////////////////////////////////////////////////// +- std::cout << ncam << " cameras; " << npoint << " 3D points; " << nproj +- << " projections\n"; +- return true; +-} +- +-void SaveBundlerOut(const char* filename, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc) { +- char listpath[1024]; +- strcpy(listpath, filename); +- char* ext = strstr(listpath, ".out"); +- if (ext == NULL) return; +- strcpy(ext, "-list.txt\0"); +- +- ofstream out(filename); +- out << "# Bundle file v0.3\n"; +- out << std::setprecision(12); // need enough precision +- out << camera_data.size() << " " << point_data.size() << '\n'; +- +- // save camera data +- for (size_t i = 0; i < camera_data.size(); ++i) { +- float q[9], c[3]; +- CameraT& ci = camera_data[i]; +- out << ci.GetFocalLength() << ' ' << ci.GetProjectionDistortion() << " 0\n"; +- ci.GetInvertedR9T(q, c); +- for (int j = 0; j < 9; ++j) out << q[j] << (((j % 3) == 2) ? 
'\n' : ' '); +- out << c[0] << ' ' << c[1] << ' ' << c[2] << '\n'; +- } +- /// +- for (size_t i = 0, j = 0; i < point_data.size(); ++i) { +- int npj = 0, *ci = &ptc[i * 3]; +- Point3D& pt = point_data[i]; +- while (j + npj < point_data.size() && ptidx[j + npj] == ptidx[j]) npj++; +- /////////////////////////// +- out << pt.xyz[0] << ' ' << pt.xyz[1] << ' ' << pt.xyz[2] << '\n'; +- out << ci[0] << ' ' << ci[1] << ' ' << ci[2] << '\n'; +- out << npj << ' '; +- for (int k = 0; k < npj; ++k) +- out << camidx[j + k] << " 0 " << measurements[j + k].x << ' ' +- << -measurements[j + k].y << '\n'; +- out << '\n'; +- j += npj; +- } +- +- ofstream listout(listpath); +- for (size_t i = 0; i < names.size(); ++i) listout << names[i] << '\n'; +-} +- +-template +-bool LoadBundlerModel(ifstream& in, vector& camera_data, +- vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx) { +- // read bundle data from a file +- size_t ncam = 0, npt = 0, nproj = 0; +- if (!(in >> ncam >> npt >> nproj)) return false; +- /////////////////////////////////////////////////////////////////////////////// +- std::cout << ncam << " cameras; " << npt << " 3D points; " << nproj +- << " projections\n"; +- +- camera_data.resize(ncam); +- point_data.resize(npt); +- measurements.resize(nproj); +- camidx.resize(nproj); +- ptidx.resize(nproj); +- +- for (size_t i = 0; i < nproj; ++i) { +- double x, y; +- int cidx, pidx; +- in >> cidx >> pidx >> x >> y; +- if (((size_t)pidx) == npt && camidx.size() > i) { +- camidx.resize(i); +- ptidx.resize(i); +- measurements.resize(i); +- std::cout << "Truncate measurements to " << i << '\n'; +- } else if (((size_t)pidx) >= npt) { +- continue; +- } else { +- camidx[i] = cidx; +- ptidx[i] = pidx; +- measurements[i].SetPoint2D(x, -y); +- } +- } +- +- for (size_t i = 0; i < ncam; ++i) { +- double p[9]; +- for (int j = 0; j < 9; ++j) in >> p[j]; +- CameraT& cam = camera_data[i]; +- cam.SetFocalLength(p[6]); +- cam.SetInvertedRT(p, p + 3); +- cam.SetProjectionDistortion(p[7]); +- } +- +- for (size_t i = 0; i < npt; ++i) { +- double pt[3]; +- in >> pt[0] >> pt[1] >> pt[2]; +- point_data[i].SetPoint(pt); +- } +- return true; +-} +- +-void SaveBundlerModel(const char* filename, vector& camera_data, +- vector& point_data, +- vector& measurements, vector& ptidx, +- vector& camidx) { +- std::cout << "Saving model to " << filename << "...\n"; +- ofstream out(filename); +- out << std::setprecision(12); // need enough precision +- out << camera_data.size() << ' ' << point_data.size() << ' ' +- << measurements.size() << '\n'; +- for (size_t i = 0; i < measurements.size(); ++i) { +- out << camidx[i] << ' ' << ptidx[i] << ' ' << measurements[i].x << ' ' +- << -measurements[i].y << '\n'; +- } +- +- for (size_t i = 0; i < camera_data.size(); ++i) { +- CameraT& cam = camera_data[i]; +- double r[3], t[3]; +- cam.GetInvertedRT(r, t); +- out << r[0] << ' ' << r[1] << ' ' << r[2] << ' ' << t[0] << ' ' << t[1] +- << ' ' << t[2] << ' ' << cam.f << ' ' << cam.GetProjectionDistortion() +- << " 0\n"; +- } +- +- for (size_t i = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- out << pt.xyz[0] << ' ' << pt.xyz[1] << ' ' << pt.xyz[2] << '\n'; +- } +-} +- +-bool LoadModelFile(const char* name, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc) { +- if (name == NULL) return false; +- ifstream in(name); +- +- std::cout << "Loading cameras/points: " << name << "\n"; +- if (!in.is_open()) return false; +- +- if 
(strstr(name, ".nvm")) +- return LoadNVM(in, camera_data, point_data, measurements, ptidx, camidx, +- names, ptc); +- else if (strstr(name, ".out")) +- return LoadBundlerOut(name, in, camera_data, point_data, measurements, +- ptidx, camidx, names, ptc); +- else +- return LoadBundlerModel(in, camera_data, point_data, measurements, ptidx, +- camidx); +-} +- +-float random_ratio(float percent) { +- return (rand() % 101 - 50) * 0.02f * percent + 1.0f; +-} +- +-void AddNoise(vector& camera_data, vector& point_data, +- float percent) { +- std::srand((unsigned int)time(NULL)); +- for (size_t i = 0; i < camera_data.size(); ++i) { +- camera_data[i].f *= random_ratio(percent); +- camera_data[i].t[0] *= random_ratio(percent); +- camera_data[i].t[1] *= random_ratio(percent); +- camera_data[i].t[2] *= random_ratio(percent); +- double e[3]; +- camera_data[i].GetRodriguesRotation(e); +- e[0] *= random_ratio(percent); +- e[1] *= random_ratio(percent); +- e[2] *= random_ratio(percent); +- camera_data[i].SetRodriguesRotation(e); +- } +- +- for (size_t i = 0; i < point_data.size(); ++i) { +- point_data[i].xyz[0] *= random_ratio(percent); +- point_data[i].xyz[1] *= random_ratio(percent); +- point_data[i].xyz[2] *= random_ratio(percent); +- } +-} +- +-void AddStableNoise(vector& camera_data, vector& point_data, +- const vector& ptidx, const vector& camidx, +- float percent) { +- /// +- std::srand((unsigned int)time(NULL)); +- // do not modify the visibility status.. +- vector zz0(ptidx.size()); +- vector backup = camera_data; +- vector vx(point_data.size()), vy(point_data.size()), +- vz(point_data.size()); +- for (size_t i = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- vx[i] = pt.xyz[0]; +- vy[i] = pt.xyz[1]; +- vz[i] = pt.xyz[2]; +- } +- +- // find out the median location of all the 3D points. +- size_t median_idx = point_data.size() / 2; +- +- std::nth_element(vx.begin(), vx.begin() + median_idx, vx.end()); +- std::nth_element(vy.begin(), vy.begin() + median_idx, vy.end()); +- std::nth_element(vz.begin(), vz.begin() + median_idx, vz.end()); +- float cx = vx[median_idx], cy = vy[median_idx], cz = vz[median_idx]; +- +- for (size_t i = 0; i < ptidx.size(); ++i) { +- CameraT& cam = camera_data[camidx[i]]; +- Point3D& pt = point_data[ptidx[i]]; +- zz0[i] = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- } +- +- vector z2 = zz0; +- median_idx = ptidx.size() / 2; +- std::nth_element(z2.begin(), z2.begin() + median_idx, z2.end()); +- float mz = z2[median_idx]; // median depth +- float dist_noise_base = mz * 0.2f; +- +- ///////////////////////////////////////////////// +- // modify points first.. 
+- for (size_t i = 0; i < point_data.size(); ++i) { +- Point3D& pt = point_data[i]; +- pt.xyz[0] = pt.xyz[0] - cx + dist_noise_base * random_ratio(percent); +- pt.xyz[1] = pt.xyz[1] - cy + dist_noise_base * random_ratio(percent); +- pt.xyz[2] = pt.xyz[2] - cz + dist_noise_base * random_ratio(percent); +- } +- +- vector need_modification(camera_data.size(), true); +- int invalid_count = 0, modify_iteration = 1; +- +- do { +- if (invalid_count) +- std::cout << "NOTE" << std::setw(2) << modify_iteration << ": modify " +- << invalid_count << " camera to fix visibility\n"; +- +- ////////////////////////////////////////////////////// +- for (size_t i = 0; i < camera_data.size(); ++i) { +- if (!need_modification[i]) continue; +- CameraT& cam = camera_data[i]; +- double e[3], c[3]; +- cam = backup[i]; +- cam.f *= random_ratio(percent); +- +- /////////////////////////////////////////////////////////// +- cam.GetCameraCenter(c); +- c[0] = c[0] - cx + dist_noise_base * random_ratio(percent); +- c[1] = c[1] - cy + dist_noise_base * random_ratio(percent); +- c[2] = c[2] - cz + dist_noise_base * random_ratio(percent); +- +- /////////////////////////////////////////////////////////// +- cam.GetRodriguesRotation(e); +- e[0] *= random_ratio(percent); +- e[1] *= random_ratio(percent); +- e[2] *= random_ratio(percent); +- +- /////////////////////////////////////////////////////////// +- cam.SetRodriguesRotation(e); +- cam.SetCameraCenterAfterRotation(c); +- } +- vector invalidc(camera_data.size(), false); +- +- invalid_count = 0; +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i]; +- if (need_modification[cid] == false) continue; +- if (invalidc[cid]) continue; +- CameraT& cam = camera_data[cid]; +- Point3D& pt = point_data[ptidx[i]]; +- float z = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- if (z * zz0[i] > 0) continue; +- if (zz0[i] == 0 && z > 0) continue; +- invalid_count++; +- invalidc[cid] = true; +- } +- +- need_modification = invalidc; +- modify_iteration++; +- +- } while (invalid_count && modify_iteration < 20); +-} +- +-void ExamineVisiblity(const char* input_filename) { +- ////////////// +- vector camera_data; +- vector point_data; +- vector ptidx, camidx; +- vector measurements; +- ifstream in(input_filename); +- LoadBundlerModel(in, camera_data, point_data, measurements, ptidx, camidx); +- +- //////////////// +- int count = 0; +- double d1 = 100, d2 = 100; +- std::cout << "checking visibility...\n"; +- vector zz(ptidx.size()); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- CameraD& cam = camera_data[camidx[i]]; +- Point3B& pt = point_data[ptidx[i]]; +- double dz = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- // double dx = cam.m[0][0] * pt.xyz[0] + cam.m[0][1] * pt.xyz[1] + +- // cam.m[0][2] * pt.xyz[2] + cam.t[0]; +- // double dy = cam.m[1][0] * pt.xyz[0] + cam.m[1][1] * pt.xyz[1] + +- // cam.m[1][2] * pt.xyz[2] + cam.t[1]; +- +- //////////////////////////////////////// +- float c[3]; +- cam.GetCameraCenter(c); +- +- CameraT camt; +- camt.SetCameraT(cam); +- Point3D ptt; +- ptt.SetPoint(pt.xyz); +- double fz = camt.m[2][0] * ptt.xyz[0] + camt.m[2][1] * ptt.xyz[1] + +- camt.m[2][2] * ptt.xyz[2] + camt.t[2]; +- double fz2 = camt.m[2][0] * (ptt.xyz[0] - c[0]) + +- camt.m[2][1] * (ptt.xyz[1] - c[1]) + +- camt.m[2][2] * (ptt.xyz[2] - c[2]); +- +- // if(dz == 0 && fz == 0) continue; +- +- if (dz * fz <= 0 || fz == 0) { +- std::cout << "cam " +- << camidx[i] //<& camera_data, +- 
vector& point_data, vector& ptidx, +- vector& camidx, vector& measurements, +- vector& names, vector& ptc) { +- vector zz(ptidx.size()); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- CameraT& cam = camera_data[camidx[i]]; +- Point3D& pt = point_data[ptidx[i]]; +- zz[i] = cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2]; +- } +- size_t median_idx = ptidx.size() / 2; +- std::nth_element(zz.begin(), zz.begin() + median_idx, zz.end()); +- float dist_threshold = zz[median_idx] * 0.001f; +- +- // keep removing 3D points. until all of them are infront of the cameras.. +- vector pmask(point_data.size(), true); +- int points_removed = 0; +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i], pid = ptidx[i]; +- if (!pmask[pid]) continue; +- CameraT& cam = camera_data[cid]; +- Point3D& pt = point_data[pid]; +- bool visible = (cam.m[2][0] * pt.xyz[0] + cam.m[2][1] * pt.xyz[1] + +- cam.m[2][2] * pt.xyz[2] + cam.t[2] > +- dist_threshold); +- pmask[pid] = visible; // this point should be removed +- if (!visible) points_removed++; +- } +- if (points_removed == 0) return false; +- vector cv(camera_data.size(), 0); +- // should any cameras be removed ? +- int min_observation = 20; // cameras should see at leat 20 points +- +- do { +- // count visible points for each camera +- std::fill(cv.begin(), cv.end(), 0); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i], pid = ptidx[i]; +- if (pmask[pid]) cv[cid]++; +- } +- +- // check if any more points should be removed +- vector pv(point_data.size(), 0); +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int cid = camidx[i], pid = ptidx[i]; +- if (!pmask[pid]) continue; // point already removed +- if (cv[cid] < min_observation) // this camera shall be removed. +- { +- /// +- } else { +- pv[pid]++; +- } +- } +- +- points_removed = 0; +- for (size_t i = 0; i < point_data.size(); ++i) { +- if (pmask[i] == false) continue; +- if (pv[i] >= 2) continue; +- pmask[i] = false; +- points_removed++; +- } +- } while (points_removed > 0); +- +- //////////////////////////////////// +- vector cmask(camera_data.size(), true); +- for (size_t i = 0; i < camera_data.size(); ++i) +- cmask[i] = cv[i] >= min_observation; +- //////////////////////////////////////////////////////// +- +- vector cidx(camera_data.size()); +- vector pidx(point_data.size()); +- +- /// modified model. 
+- vector camera_data2; +- vector point_data2; +- vector ptidx2; +- vector camidx2; +- vector measurements2; +- vector names2; +- vector ptc2; +- +- // +- if (names.size() < camera_data.size()) +- names.resize(camera_data.size(), string("unknown")); +- if (ptc.size() < 3 * point_data.size()) ptc.resize(point_data.size() * 3, 0); +- +- ////////////////////////////// +- int new_camera_count = 0, new_point_count = 0; +- for (size_t i = 0; i < camera_data.size(); ++i) { +- if (!cmask[i]) continue; +- camera_data2.push_back(camera_data[i]); +- names2.push_back(names[i]); +- cidx[i] = new_camera_count++; +- } +- +- for (size_t i = 0; i < point_data.size(); ++i) { +- if (!pmask[i]) continue; +- point_data2.push_back(point_data[i]); +- ptc.push_back(ptc[i]); +- pidx[i] = new_point_count++; +- } +- +- int new_observation_count = 0; +- for (size_t i = 0; i < ptidx.size(); ++i) { +- int pid = ptidx[i], cid = camidx[i]; +- if (!pmask[pid] || !cmask[cid]) continue; +- ptidx2.push_back(pidx[pid]); +- camidx2.push_back(cidx[cid]); +- measurements2.push_back(measurements[i]); +- new_observation_count++; +- } +- +- std::cout << "NOTE: removing " << (camera_data.size() - new_camera_count) +- << " cameras; " << (point_data.size() - new_point_count) +- << " 3D Points; " << (measurements.size() - new_observation_count) +- << " Observations;\n"; +- +- camera_data2.swap(camera_data); +- names2.swap(names); +- point_data2.swap(point_data); +- ptc2.swap(ptc); +- ptidx2.swap(ptidx); +- camidx2.swap(camidx); +- measurements2.swap(measurements); +- +- return true; +-} +- +-void SaveModelFile(const char* outpath, vector& camera_data, +- vector& point_data, vector& measurements, +- vector& ptidx, vector& camidx, +- vector& names, vector& ptc) { +- if (outpath == NULL) return; +- if (strstr(outpath, ".nvm")) +- SaveNVM(outpath, camera_data, point_data, measurements, ptidx, camidx, +- names, ptc); +- else if (strstr(outpath, ".out")) +- SaveBundlerOut(outpath, camera_data, point_data, measurements, ptidx, +- camidx, names, ptc); +- else +- SaveBundlerModel(outpath, camera_data, point_data, measurements, ptidx, +- camidx); +-} +- +-} // namespace pba +diff --git a/src/controllers/incremental_mapper.cc b/src/controllers/incremental_mapper.cc +index 80aa0651f..1d6091fb7 100644 +--- a/src/controllers/incremental_mapper.cc ++++ b/src/controllers/incremental_mapper.cc +@@ -63,15 +63,7 @@ void AdjustGlobalBundle(const IncrementalMapperOptions& options, + } + + PrintHeading1("Global bundle adjustment"); +- if (options.ba_global_use_pba && !options.fix_existing_images && +- num_reg_images >= kMinNumRegImagesForFastBA && +- ParallelBundleAdjuster::IsSupported(custom_ba_options, +- mapper->GetReconstruction())) { +- mapper->AdjustParallelGlobalBundle( +- custom_ba_options, options.ParallelGlobalBundleAdjustment()); +- } else { +- mapper->AdjustGlobalBundle(options.Mapper(), custom_ba_options); +- } ++ mapper->AdjustGlobalBundle(options.Mapper(), custom_ba_options); + } + + void IterativeLocalRefinement(const IncrementalMapperOptions& options, +@@ -263,18 +255,6 @@ BundleAdjustmentOptions IncrementalMapperOptions::GlobalBundleAdjustment() + return options; + } + +-ParallelBundleAdjuster::Options +-IncrementalMapperOptions::ParallelGlobalBundleAdjustment() const { +- ParallelBundleAdjuster::Options options; +- options.max_num_iterations = ba_global_max_num_iterations; +- options.print_summary = true; +- options.gpu_index = ba_global_pba_gpu_index; +- options.num_threads = num_threads; +- options.min_num_residuals_for_multi_threading 
= +- ba_min_num_residuals_for_multi_threading; +- return options; +-} +- + bool IncrementalMapperOptions::Check() const { + CHECK_OPTION_GT(min_num_matches, 0); + CHECK_OPTION_GT(max_num_models, 0); +diff --git a/src/controllers/incremental_mapper.h b/src/controllers/incremental_mapper.h +index 3686a58e0..f3731d1c8 100644 +--- a/src/controllers/incremental_mapper.h ++++ b/src/controllers/incremental_mapper.h +@@ -99,12 +99,6 @@ struct IncrementalMapperOptions { + // The maximum number of local bundle adjustment iterations. + int ba_local_max_num_iterations = 25; + +- // Whether to use PBA in global bundle adjustment. +- bool ba_global_use_pba = false; +- +- // The GPU index for PBA bundle adjustment. +- int ba_global_pba_gpu_index = -1; +- + // The growth rates after which to perform global bundle adjustment. + double ba_global_images_ratio = 1.1; + double ba_global_points_ratio = 1.1; +@@ -140,7 +134,6 @@ struct IncrementalMapperOptions { + IncrementalTriangulator::Options Triangulation() const; + BundleAdjustmentOptions LocalBundleAdjustment() const; + BundleAdjustmentOptions GlobalBundleAdjustment() const; +- ParallelBundleAdjuster::Options ParallelGlobalBundleAdjustment() const; + + bool Check() const; + +diff --git a/src/optim/bundle_adjustment.cc b/src/optim/bundle_adjustment.cc +index ace191426..2def3d63c 100644 +--- a/src/optim/bundle_adjustment.cc ++++ b/src/optim/bundle_adjustment.cc +@@ -529,259 +529,6 @@ void BundleAdjuster::ParameterizePoints(Reconstruction* reconstruction) { + } + } + +-//////////////////////////////////////////////////////////////////////////////// +-// ParallelBundleAdjuster +-//////////////////////////////////////////////////////////////////////////////// +- +-bool ParallelBundleAdjuster::Options::Check() const { +- CHECK_OPTION_GE(max_num_iterations, 0); +- return true; +-} +- +-ParallelBundleAdjuster::ParallelBundleAdjuster( +- const Options& options, const BundleAdjustmentOptions& ba_options, +- const BundleAdjustmentConfig& config) +- : options_(options), +- ba_options_(ba_options), +- config_(config), +- num_measurements_(0) { +- CHECK(options_.Check()); +- CHECK(ba_options_.Check()); +- CHECK_EQ(config_.NumConstantCameras(), 0) +- << "PBA does not allow to set individual cameras constant"; +- CHECK_EQ(config_.NumConstantPoses(), 0) +- << "PBA does not allow to set individual translational elements constant"; +- CHECK_EQ(config_.NumConstantTvecs(), 0) +- << "PBA does not allow to set individual translational elements constant"; +- CHECK(config_.NumVariablePoints() == 0 && config_.NumConstantPoints() == 0) +- << "PBA does not allow to parameterize individual 3D points"; +-} +- +-bool ParallelBundleAdjuster::Solve(Reconstruction* reconstruction) { +- CHECK_NOTNULL(reconstruction); +- CHECK_EQ(num_measurements_, 0) +- << "Cannot use the same ParallelBundleAdjuster multiple times"; +- CHECK(!ba_options_.refine_principal_point); +- CHECK_EQ(ba_options_.refine_focal_length, ba_options_.refine_extra_params); +- +- SetUp(reconstruction); +- +- const int num_residuals = static_cast(2 * measurements_.size()); +- +- size_t num_threads = options_.num_threads; +- if (num_residuals < options_.min_num_residuals_for_multi_threading) { +- num_threads = 1; +- } +- +- pba::ParallelBA::DeviceT device; +- const int kMaxNumResidualsFloat = 100 * 1000; +- if (num_residuals > kMaxNumResidualsFloat) { +- // The threshold for using double precision is empirically chosen and +- // ensures that the system can be reliable solved. 
+- device = pba::ParallelBA::PBA_CPU_DOUBLE; +- } else { +- if (options_.gpu_index < 0) { +- device = pba::ParallelBA::PBA_CUDA_DEVICE_DEFAULT; +- } else { +- device = static_cast( +- pba::ParallelBA::PBA_CUDA_DEVICE0 + options_.gpu_index); +- } +- } +- +- pba::ParallelBA pba(device, num_threads); +- +- pba.SetNextBundleMode(pba::ParallelBA::BUNDLE_FULL); +- pba.EnableRadialDistortion(pba::ParallelBA::PBA_PROJECTION_DISTORTION); +- pba.SetFixedIntrinsics(!ba_options_.refine_focal_length && +- !ba_options_.refine_extra_params); +- +- pba::ConfigBA* pba_config = pba.GetInternalConfig(); +- pba_config->__lm_delta_threshold /= 100.0f; +- pba_config->__lm_gradient_threshold /= 100.0f; +- pba_config->__lm_mse_threshold = 0.0f; +- pba_config->__cg_min_iteration = 10; +- pba_config->__verbose_level = 2; +- pba_config->__lm_max_iteration = options_.max_num_iterations; +- +- pba.SetCameraData(cameras_.size(), cameras_.data()); +- pba.SetPointData(points3D_.size(), points3D_.data()); +- pba.SetProjection(measurements_.size(), measurements_.data(), +- point3D_idxs_.data(), camera_idxs_.data()); +- +- Timer timer; +- timer.Start(); +- pba.RunBundleAdjustment(); +- timer.Pause(); +- +- // Compose Ceres solver summary from PBA options. +- summary_.num_residuals_reduced = num_residuals; +- summary_.num_effective_parameters_reduced = +- static_cast(8 * config_.NumImages() - +- 2 * config_.NumConstantCameras() + 3 * points3D_.size()); +- summary_.num_successful_steps = pba_config->GetIterationsLM() + 1; +- summary_.termination_type = ceres::TerminationType::USER_SUCCESS; +- summary_.initial_cost = +- pba_config->GetInitialMSE() * summary_.num_residuals_reduced / 4; +- summary_.final_cost = +- pba_config->GetFinalMSE() * summary_.num_residuals_reduced / 4; +- summary_.total_time_in_seconds = timer.ElapsedSeconds(); +- +- TearDown(reconstruction); +- +- if (options_.print_summary) { +- PrintHeading2("Bundle adjustment report"); +- PrintSolverSummary(summary_); +- } +- +- return true; +-} +- +-const ceres::Solver::Summary& ParallelBundleAdjuster::Summary() const { +- return summary_; +-} +- +-bool ParallelBundleAdjuster::IsSupported(const BundleAdjustmentOptions& options, +- const Reconstruction& reconstruction) { +- if (options.refine_principal_point || +- options.refine_focal_length != options.refine_extra_params) { +- return false; +- } +- +- // Check that all cameras are SIMPLE_RADIAL and that no intrinsics are shared. +- std::set camera_ids; +- for (const auto& image : reconstruction.Images()) { +- if (image.second.IsRegistered()) { +- if (camera_ids.count(image.second.CameraId()) != 0 || +- reconstruction.Camera(image.second.CameraId()).ModelId() != +- SimpleRadialCameraModel::model_id) { +- return false; +- } +- camera_ids.insert(image.second.CameraId()); +- } +- } +- return true; +-} +- +-void ParallelBundleAdjuster::SetUp(Reconstruction* reconstruction) { +- // Important: PBA requires the track of 3D points to be stored +- // contiguously, i.e. the point3D_idxs_ vector contains consecutive indices. 
+- cameras_.reserve(config_.NumImages()); +- camera_ids_.reserve(config_.NumImages()); +- ordered_image_ids_.reserve(config_.NumImages()); +- image_id_to_camera_idx_.reserve(config_.NumImages()); +- AddImagesToProblem(reconstruction); +- AddPointsToProblem(reconstruction); +-} +- +-void ParallelBundleAdjuster::TearDown(Reconstruction* reconstruction) { +- for (size_t i = 0; i < cameras_.size(); ++i) { +- const image_t image_id = ordered_image_ids_[i]; +- const pba::CameraT& pba_camera = cameras_[i]; +- +- // Note: Do not use PBA's quaternion methods as they seem to lead to +- // numerical instability or other issues. +- Image& image = reconstruction->Image(image_id); +- Eigen::Matrix3d rotation_matrix; +- pba_camera.GetMatrixRotation(rotation_matrix.data()); +- pba_camera.GetTranslation(image.Tvec().data()); +- image.Qvec() = RotationMatrixToQuaternion(rotation_matrix.transpose()); +- +- Camera& camera = reconstruction->Camera(image.CameraId()); +- camera.Params(0) = pba_camera.GetFocalLength(); +- camera.Params(3) = pba_camera.GetProjectionDistortion(); +- } +- +- for (size_t i = 0; i < points3D_.size(); ++i) { +- Point3D& point3D = reconstruction->Point3D(ordered_point3D_ids_[i]); +- points3D_[i].GetPoint(point3D.XYZ().data()); +- } +-} +- +-void ParallelBundleAdjuster::AddImagesToProblem( +- Reconstruction* reconstruction) { +- for (const image_t image_id : config_.Images()) { +- const Image& image = reconstruction->Image(image_id); +- CHECK_EQ(camera_ids_.count(image.CameraId()), 0) +- << "PBA does not support shared intrinsics"; +- +- const Camera& camera = reconstruction->Camera(image.CameraId()); +- CHECK_EQ(camera.ModelId(), SimpleRadialCameraModel::model_id) +- << "PBA only supports the SIMPLE_RADIAL camera model"; +- +- // Note: Do not use PBA's quaternion methods as they seem to lead to +- // numerical instability or other issues. 
+- const Eigen::Matrix3d rotation_matrix = +- QuaternionToRotationMatrix(image.Qvec()).transpose(); +- +- pba::CameraT pba_camera; +- pba_camera.SetFocalLength(camera.Params(0)); +- pba_camera.SetProjectionDistortion(camera.Params(3)); +- pba_camera.SetMatrixRotation(rotation_matrix.data()); +- pba_camera.SetTranslation(image.Tvec().data()); +- +- CHECK(!config_.HasConstantTvec(image_id)) +- << "PBA cannot fix partial extrinsics"; +- if (!ba_options_.refine_extrinsics || config_.HasConstantPose(image_id)) { +- CHECK(config_.IsConstantCamera(image.CameraId())) +- << "PBA cannot fix extrinsics only"; +- pba_camera.SetConstantCamera(); +- } else if (config_.IsConstantCamera(image.CameraId())) { +- pba_camera.SetFixedIntrinsic(); +- } else { +- pba_camera.SetVariableCamera(); +- } +- +- num_measurements_ += image.NumPoints3D(); +- cameras_.push_back(pba_camera); +- camera_ids_.insert(image.CameraId()); +- ordered_image_ids_.push_back(image_id); +- image_id_to_camera_idx_.emplace(image_id, +- static_cast(cameras_.size()) - 1); +- +- for (const Point2D& point2D : image.Points2D()) { +- if (point2D.HasPoint3D()) { +- point3D_ids_.insert(point2D.Point3DId()); +- } +- } +- } +-} +- +-void ParallelBundleAdjuster::AddPointsToProblem( +- Reconstruction* reconstruction) { +- points3D_.resize(point3D_ids_.size()); +- ordered_point3D_ids_.resize(point3D_ids_.size()); +- measurements_.resize(num_measurements_); +- camera_idxs_.resize(num_measurements_); +- point3D_idxs_.resize(num_measurements_); +- +- int point3D_idx = 0; +- size_t measurement_idx = 0; +- +- for (const auto point3D_id : point3D_ids_) { +- const Point3D& point3D = reconstruction->Point3D(point3D_id); +- points3D_[point3D_idx].SetPoint(point3D.XYZ().data()); +- ordered_point3D_ids_[point3D_idx] = point3D_id; +- +- for (const auto track_el : point3D.Track().Elements()) { +- if (image_id_to_camera_idx_.count(track_el.image_id) > 0) { +- const Image& image = reconstruction->Image(track_el.image_id); +- const Camera& camera = reconstruction->Camera(image.CameraId()); +- const Point2D& point2D = image.Point2D(track_el.point2D_idx); +- measurements_[measurement_idx].SetPoint2D( +- point2D.X() - camera.Params(1), point2D.Y() - camera.Params(2)); +- camera_idxs_[measurement_idx] = +- image_id_to_camera_idx_.at(track_el.image_id); +- point3D_idxs_[measurement_idx] = point3D_idx; +- measurement_idx += 1; +- } +- } +- point3D_idx += 1; +- } +- +- CHECK_EQ(point3D_idx, points3D_.size()); +- CHECK_EQ(measurement_idx, measurements_.size()); +-} +- + //////////////////////////////////////////////////////////////////////////////// + // RigBundleAdjuster + //////////////////////////////////////////////////////////////////////////////// +diff --git a/src/optim/bundle_adjustment.h b/src/optim/bundle_adjustment.h +index 8d6282ea7..13e462090 100644 +--- a/src/optim/bundle_adjustment.h ++++ b/src/optim/bundle_adjustment.h +@@ -39,7 +39,6 @@ + + #include + +-#include "PBA/pba.h" + #include "base/camera_rig.h" + #include "base/reconstruction.h" + #include "util/alignment.h" +@@ -202,71 +201,6 @@ class BundleAdjuster { + std::unordered_map point3D_num_observations_; + }; + +-// Bundle adjustment using PBA (GPU or CPU). Less flexible and accurate than +-// Ceres-Solver bundle adjustment but much faster. Only supports SimpleRadial +-// camera model. +-class ParallelBundleAdjuster { +- public: +- struct Options { +- // Whether to print a final summary. +- bool print_summary = true; +- +- // Maximum number of iterations. 
+- int max_num_iterations = 50; +- +- // Index of the GPU used for bundle adjustment. +- int gpu_index = -1; +- +- // Number of threads for CPU based bundle adjustment. +- int num_threads = -1; +- +- // Minimum number of residuals to enable multi-threading. Note that +- // single-threaded is typically better for small bundle adjustment problems +- // due to the overhead of threading. +- int min_num_residuals_for_multi_threading = 50000; +- +- bool Check() const; +- }; +- +- ParallelBundleAdjuster(const Options& options, +- const BundleAdjustmentOptions& ba_options, +- const BundleAdjustmentConfig& config); +- +- bool Solve(Reconstruction* reconstruction); +- +- // Get the Ceres solver summary for the last call to `Solve`. +- const ceres::Solver::Summary& Summary() const; +- +- // Check whether PBA is supported for the given reconstruction. If the +- // reconstruction is not supported, the PBA solver will exit ungracefully. +- static bool IsSupported(const BundleAdjustmentOptions& options, +- const Reconstruction& reconstruction); +- +- private: +- void SetUp(Reconstruction* reconstruction); +- void TearDown(Reconstruction* reconstruction); +- +- void AddImagesToProblem(Reconstruction* reconstruction); +- void AddPointsToProblem(Reconstruction* reconstruction); +- +- const Options options_; +- const BundleAdjustmentOptions ba_options_; +- BundleAdjustmentConfig config_; +- ceres::Solver::Summary summary_; +- +- size_t num_measurements_; +- std::vector cameras_; +- std::vector points3D_; +- std::vector measurements_; +- std::unordered_set camera_ids_; +- std::unordered_set point3D_ids_; +- std::vector camera_idxs_; +- std::vector point3D_idxs_; +- std::vector ordered_image_ids_; +- std::vector ordered_point3D_ids_; +- std::unordered_map image_id_to_camera_idx_; +-}; +- + class RigBundleAdjuster : public BundleAdjuster { + public: + struct Options { +diff --git a/src/optim/bundle_adjustment_test.cc b/src/optim/bundle_adjustment_test.cc +index 1d8ba0e6f..f1c4d6bee 100644 +--- a/src/optim/bundle_adjustment_test.cc ++++ b/src/optim/bundle_adjustment_test.cc +@@ -644,114 +644,6 @@ BOOST_AUTO_TEST_CASE(TestConstantExtraParam) { + } + } + +-BOOST_AUTO_TEST_CASE(TestParallelReconstructionSupported) { +- BundleAdjustmentOptions options; +- options.refine_focal_length = true; +- options.refine_principal_point = false; +- options.refine_extra_params = true; +- Reconstruction reconstruction; +- CorrespondenceGraph correspondence_graph; +- GenerateReconstruction(2, 100, &reconstruction, &correspondence_graph); +- BOOST_CHECK(ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- reconstruction.Camera(0).SetModelIdFromName("SIMPLE_PINHOLE"); +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- reconstruction.Camera(0).SetModelIdFromName("SIMPLE_RADIAL"); +- BOOST_CHECK(ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- options.refine_principal_point = true; +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- options.refine_principal_point = false; +- +- options.refine_focal_length = false; +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- options.refine_extra_params = false; +- BOOST_CHECK(ParallelBundleAdjuster::IsSupported(options, reconstruction)); +- +- options.refine_focal_length = true; +- BOOST_CHECK(!ParallelBundleAdjuster::IsSupported(options, reconstruction)); +-} +- +-BOOST_AUTO_TEST_CASE(TestParallelTwoViewVariableIntrinsics) { +- Reconstruction reconstruction; +- 
CorrespondenceGraph correspondence_graph; +- GenerateReconstruction(2, 100, &reconstruction, &correspondence_graph); +- const auto orig_reconstruction = reconstruction; +- +- BundleAdjustmentConfig config; +- config.AddImage(0); +- config.AddImage(1); +- +- ParallelBundleAdjuster::Options options; +- BundleAdjustmentOptions ba_options; +- ba_options.refine_focal_length = true; +- ba_options.refine_principal_point = false; +- ba_options.refine_extra_params = true; +- ParallelBundleAdjuster bundle_adjuster(options, ba_options, config); +- BOOST_REQUIRE(bundle_adjuster.Solve(&reconstruction)); +- +- const auto summary = bundle_adjuster.Summary(); +- +- // 100 points, 2 images, 2 residuals per point per image +- BOOST_CHECK_EQUAL(summary.num_residuals_reduced, 400); +- // 100 x 3 point parameters +- // + 12 image parameters +- // + 2 x 2 camera parameters +- BOOST_CHECK_EQUAL(summary.num_effective_parameters_reduced, 316); +- +- CheckVariableCamera(reconstruction.Camera(0), orig_reconstruction.Camera(0)); +- CheckVariableImage(reconstruction.Image(0), orig_reconstruction.Image(0)); +- +- CheckVariableCamera(reconstruction.Camera(1), orig_reconstruction.Camera(1)); +- CheckVariableImage(reconstruction.Image(1), orig_reconstruction.Image(1)); +- +- for (const auto& point3D : reconstruction.Points3D()) { +- CheckVariablePoint(point3D.second, +- orig_reconstruction.Point3D(point3D.first)); +- } +-} +- +-BOOST_AUTO_TEST_CASE(TestParallelTwoViewConstantIntrinsics) { +- Reconstruction reconstruction; +- CorrespondenceGraph correspondence_graph; +- GenerateReconstruction(2, 100, &reconstruction, &correspondence_graph); +- const auto orig_reconstruction = reconstruction; +- +- BundleAdjustmentConfig config; +- config.AddImage(0); +- config.AddImage(1); +- +- ParallelBundleAdjuster::Options options; +- BundleAdjustmentOptions ba_options; +- ba_options.refine_focal_length = false; +- ba_options.refine_principal_point = false; +- ba_options.refine_extra_params = false; +- ParallelBundleAdjuster bundle_adjuster(options, ba_options, config); +- BOOST_REQUIRE(bundle_adjuster.Solve(&reconstruction)); +- +- const auto summary = bundle_adjuster.Summary(); +- +- // 100 points, 2 images, 2 residuals per point per image +- BOOST_CHECK_EQUAL(summary.num_residuals_reduced, 400); +- // 100 x 3 point parameters +- // + 12 image parameters +- // + 2 x 2 camera parameters +- BOOST_CHECK_EQUAL(summary.num_effective_parameters_reduced, 316); +- +- CheckConstantCamera(reconstruction.Camera(0), orig_reconstruction.Camera(0)); +- CheckVariableImage(reconstruction.Image(0), orig_reconstruction.Image(0)); +- +- CheckConstantCamera(reconstruction.Camera(1), orig_reconstruction.Camera(1)); +- CheckVariableImage(reconstruction.Image(1), orig_reconstruction.Image(1)); +- +- for (const auto& point3D : reconstruction.Points3D()) { +- CheckVariablePoint(point3D.second, +- orig_reconstruction.Point3D(point3D.first)); +- } +-} +- + BOOST_AUTO_TEST_CASE(TestRigTwoView) { + Reconstruction reconstruction; + CorrespondenceGraph correspondence_graph; +diff --git a/src/sfm/incremental_mapper.cc b/src/sfm/incremental_mapper.cc +index 33bd82305..c1362d0f0 100644 +--- a/src/sfm/incremental_mapper.cc ++++ b/src/sfm/incremental_mapper.cc +@@ -713,39 +713,6 @@ bool IncrementalMapper::AdjustGlobalBundle( + return true; + } + +-bool IncrementalMapper::AdjustParallelGlobalBundle( +- const BundleAdjustmentOptions& ba_options, +- const ParallelBundleAdjuster::Options& parallel_ba_options) { +- CHECK_NOTNULL(reconstruction_); +- +- const std::vector& 
reg_image_ids = reconstruction_->RegImageIds();
+-
+-  CHECK_GE(reg_image_ids.size(), 2)
+-      << "At least two images must be registered for global bundle-adjustment";
+-
+-  // Avoid degeneracies in bundle adjustment.
+-  reconstruction_->FilterObservationsWithNegativeDepth();
+-
+-  // Configure bundle adjustment.
+-  BundleAdjustmentConfig ba_config;
+-  for (const image_t image_id : reg_image_ids) {
+-    ba_config.AddImage(image_id);
+-  }
+-
+-  // Run bundle adjustment.
+-  ParallelBundleAdjuster bundle_adjuster(parallel_ba_options, ba_options,
+-                                         ba_config);
+-  if (!bundle_adjuster.Solve(reconstruction_)) {
+-    return false;
+-  }
+-
+-  // Normalize scene for numerical stability and
+-  // to avoid large scale changes in viewer.
+-  reconstruction_->Normalize();
+-
+-  return true;
+-}
+-
+ size_t IncrementalMapper::FilterImages(const Options& options) {
+   CHECK_NOTNULL(reconstruction_);
+   CHECK(options.Check());
+diff --git a/src/sfm/incremental_mapper.h b/src/sfm/incremental_mapper.h
+index 859194f14..5dd6fc549 100644
+--- a/src/sfm/incremental_mapper.h
++++ b/src/sfm/incremental_mapper.h
+@@ -206,12 +206,9 @@ class IncrementalMapper {
+       const IncrementalTriangulator::Options& tri_options,
+       const image_t image_id, const std::unordered_set<point3D_t>& point3D_ids);
+
+-  // Global bundle adjustment using Ceres Solver or PBA.
++  // Global bundle adjustment using Ceres Solver.
+   bool AdjustGlobalBundle(const Options& options,
+                           const BundleAdjustmentOptions& ba_options);
+-  bool AdjustParallelGlobalBundle(
+-      const BundleAdjustmentOptions& ba_options,
+-      const ParallelBundleAdjuster::Options& parallel_ba_options);
+
+   // Filter images and point observations.
+   size_t FilterImages(const Options& options);
+diff --git a/src/ui/license_widget.cc b/src/ui/license_widget.cc
+index d1cedf667..def4cc8bb 100644
+--- a/src/ui/license_widget.cc
++++ b/src/ui/license_widget.cc
+@@ -45,8 +45,6 @@ LicenseWidget::LicenseWidget(QWidget* parent) : QTextEdit(parent) {
+   licenses += "<h2>External</h2>";
+   licenses += "<h3>LSD</h3>";
+   licenses += GetLSDLicense();
+-  licenses += "<h3>PBA</h3>";
+-  licenses += GetPBALicense();
+   licenses += "<h3>PoissonRecon</h3>";
+   licenses += GetPoissonReconLicense();
+   licenses += "<h3>SiftGPU</h3>";
+@@ -137,23 +135,6 @@ QString LicenseWidget::GetLSDLicense() const {
+   return license;
+ }
+
+-QString LicenseWidget::GetPBALicense() const {
+-  const QString license =
+-      "Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu)<br>"
+-      "and the University of Washington at Seattle<br>"
+-      "<br>"
+-      "This library is free software; you can redistribute it and/or<br>"
+-      "modify it under the terms of the GNU General Public<br>"
+-      "License as published by the Free Software Foundation; either<br>"
+-      "Version 3 of the License, or (at your option) any later version.<br>"
+-      "<br>"
+-      "This library is distributed in the hope that it will be useful,<br>"
+-      "but WITHOUT ANY WARRANTY; without even the implied warranty of<br>"
+-      "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br>"
+-      "General Public License for more details.";
+-  return license;
+-}
+-
+ QString LicenseWidget::GetPoissonReconLicense() const {
+   const QString license =
+       "The MIT License (MIT)<br>"
" +diff --git a/src/ui/license_widget.h b/src/ui/license_widget.h +index 0a43902b3..e6eca4da0 100644 +--- a/src/ui/license_widget.h ++++ b/src/ui/license_widget.h +@@ -43,7 +43,6 @@ class LicenseWidget : public QTextEdit { + private: + QString GetCOLMAPLicense() const; + QString GetLSDLicense() const; +- QString GetPBALicense() const; + QString GetPoissonReconLicense() const; + QString GetSiftGPULicense() const; + QString GetSQLiteLicense() const; +diff --git a/src/ui/reconstruction_options_widget.cc b/src/ui/reconstruction_options_widget.cc +index b9b79f8a3..3f80846a1 100644 +--- a/src/ui/reconstruction_options_widget.cc ++++ b/src/ui/reconstruction_options_widget.cc +@@ -130,15 +130,12 @@ MapperBundleAdjustmentOptionsWidget::MapperBundleAdjustmentOptionsWidget( + AddSpacer(); + + AddSection("Global Bundle Adjustment"); +- AddOptionBool(&options->mapper->ba_global_use_pba, +- "use_pba\n(requires SIMPLE_RADIAL)"); + AddOptionDouble(&options->mapper->ba_global_images_ratio, "images_ratio"); + AddOptionInt(&options->mapper->ba_global_images_freq, "images_freq"); + AddOptionDouble(&options->mapper->ba_global_points_ratio, "points_ratio"); + AddOptionInt(&options->mapper->ba_global_points_freq, "points_freq"); + AddOptionInt(&options->mapper->ba_global_max_num_iterations, + "max_num_iterations"); +- AddOptionInt(&options->mapper->ba_global_pba_gpu_index, "pba_gpu_index", -1); + AddOptionInt(&options->mapper->ba_global_max_refinements, "max_refinements", + 1); + AddOptionDouble(&options->mapper->ba_global_max_refinement_change, +diff --git a/src/util/option_manager.cc b/src/util/option_manager.cc +index e31105490..f620078af 100644 +--- a/src/util/option_manager.cc ++++ b/src/util/option_manager.cc +@@ -514,10 +514,6 @@ void OptionManager::AddMapperOptions() { + &mapper->ba_local_function_tolerance); + AddAndRegisterDefaultOption("Mapper.ba_local_max_num_iterations", + &mapper->ba_local_max_num_iterations); +- AddAndRegisterDefaultOption("Mapper.ba_global_use_pba", +- &mapper->ba_global_use_pba); +- AddAndRegisterDefaultOption("Mapper.ba_global_pba_gpu_index", +- &mapper->ba_global_pba_gpu_index); + AddAndRegisterDefaultOption("Mapper.ba_global_images_ratio", + &mapper->ba_global_images_ratio); + AddAndRegisterDefaultOption("Mapper.ba_global_points_ratio", diff --git a/recipe/1841.patch b/recipe/1841.patch new file mode 100644 index 0000000..8cd9c17 --- /dev/null +++ b/recipe/1841.patch @@ -0,0 +1,40 @@ +From 9bbf3e688996ad05c0faf8c7345a77d7be4c3263 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?= +Date: Sun, 12 Mar 2023 15:53:59 +0100 +Subject: [PATCH] Replace deprecated CUDA sature function call + +--- + lib/SiftGPU/ProgramCU.cu | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu +index 0b99ad066..700c26b0f 100644 +--- a/lib/SiftGPU/ProgramCU.cu ++++ b/lib/SiftGPU/ProgramCU.cu +@@ -1245,7 +1245,7 @@ void __global__ ConvertDOG_Kernel(cudaTextureObject_t texData, float* d_result, + int index = row * width + col; + float v = tex1Dfetch(texData, index); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? +- 0.5 : saturate(0.5+20.0*v); ++ 0.5 : __saturatef(0.5+20.0*v); + } + } + /// +@@ -1269,7 +1269,7 @@ void __global__ ConvertGRD_Kernel(cudaTextureObject_t texData, float* d_result, + int index = row * width + col; + float v = tex1Dfetch(texData, index << 1); + d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? 
+-    0 : saturate(5 * v);
++    0 : __saturatef(5 * v);
+
+ }
+ }
+@@ -1297,7 +1297,7 @@ void __global__ ConvertKEY_Kernel(cudaTextureObject_t texData, cudaTextureObject
+     float4 keyv = tex1Dfetch<float4>(texDataF4, index);
+     int is_key = (keyv.x == 1.0f || keyv.x == -1.0f);
+     int inside = col > 0 && row > 0 && row < height -1 && col < width - 1;
+-    float v = inside? saturate(0.5 + 20 * tex1Dfetch<float>(texData, index)) : 0.5;
++    float v = inside? __saturatef(0.5 + 20 * tex1Dfetch<float>(texData, index)) : 0.5;
+     d_result[index] = is_key && inside ?
+         (keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)):
+         make_float4(v, v, v, 1.0f) ;
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index e406d2e..34d823f 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,6 +1,6 @@
 {% set name = "colmap" %}
 {% set version = "3.8" %}
-{% set build = 2 %}
+{% set build = 3 %}
 
 {% set processor = "cpu" if cuda_compiler_version == "None" else "gpu" %}  # [not osx]
 {% set processor = "cpu" %}  # [osx]
@@ -19,6 +19,11 @@ source:
     - fix_build.patch
     - unvendor_vlfeat.patch  # [unix]
     - fix_find_lz4.patch
+    - 1809.patch  # [cuda_compiler_version == "12.0"]
+    - 1840.patch  # [cuda_compiler_version == "12.0"]
+    - 1823.patch  # [cuda_compiler_version == "12.0"]
+    - 1838.patch  # [cuda_compiler_version == "12.0"]
+    - 1841.patch  # [cuda_compiler_version == "12.0"]
 
 build:
   number: {{ build }}
@@ -82,6 +87,7 @@ requirements:
     - gmp  # [unix]
     - lz4-c
     - metis
+    - libcurand-dev  # [cuda_compiler_version == "12.0"]
   run:
     - boost-cpp
    - vlfeat  # [unix]
@@ -112,7 +118,7 @@ about:
   license_file:
     - COPYING.txt
    - lib/LSD/LICENSE
-    - lib/PBA/LICENSE
+    - lib/PBA/LICENSE  # [cuda_compiler_version != "12.0"]
    - lib/PoissonRecon/LICENSE
    - lib/SiftGPU/LICENSE
   summary: COLMAP is a general-purpose Structure-from-Motion (SfM) and Multi-View Stereo (MVS) pipeline with a graphical and command-line interface.