diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index ca2cf41669f..e181387d719 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -16,7 +16,8 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ - cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o + cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o \ + cu-sparse-matrix.o ifeq ($(CUDA), true) OBJFILES += cu-kernels.o cu-randkernels.o endif diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 650867a8172..a0761e37de7 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -303,6 +303,15 @@ void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in); void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in); +void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); +void cuda_copy_from_smat_fd(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); +void cuda_copy_from_smat_df(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); +void cuda_copy_from_smat_dd(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); +void cuda_copy_from_smat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); +void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); +void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); +void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); + void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int s); void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index d5aa282f21a..82680a9be67 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -254,6 +254,24 @@ static void _copy_from_mat_trans(Real* mat_out, const OtherReal* mat_in, MatrixD mat_out[index_out] = static_cast(mat_in[index_in]); } +template +__global__ +static void _copy_from_smat(Real* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + int smat_index = blockIdx.x * blockDim.x + threadIdx.x; + if (smat_index >= d_in) return; + int data_index = smat_in[smat_index].row * d_out.stride + smat_in[smat_index].column; + mat_out[data_index] = smat_in[smat_index].weight; +} + +template +__global__ +static void _copy_from_smat_trans(Real* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + int smat_index = blockIdx.x * blockDim.x + threadIdx.x; + if (smat_index >= d_in) return; + int data_index = smat_in[smat_index].column * d_out.stride + smat_in[smat_index].row; + mat_out[data_index] = smat_in[smat_index].weight; +} + template __global__ static void _transpose_matrix(Real* mat, MatrixDim d) { @@ -2907,3 +2925,27 @@ void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); } +void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat<<>>(mat_out, smat_in, d_out, d_in); +} +void cuda_copy_from_smat_fd(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat<<>>(mat_out, smat_in, d_out, d_in); +} +void cuda_copy_from_smat_df(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat<<>>(mat_out, smat_in, d_out, d_in); +} +void cuda_copy_from_smat_dd(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat<<>>(mat_out, smat_in, d_out, d_in); +} +void cuda_copy_from_smat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat_trans<<>>(mat_out, smat_in, d_out, d_in); +} +void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat_trans<<>>(mat_out, smat_in, d_out, d_in); +} +void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat_trans<<>>(mat_out, smat_in, d_out, d_in); +} +void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + _copy_from_smat_trans<<>>(mat_out, smat_in, d_out, d_in); +} diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 7b3d9f4eaf1..8e9f3590c86 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -82,6 +82,32 @@ inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, double* mat_out, const fl cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in); } +inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_ff(Gr, Bl, mat_out, smat_in, d_out, d_in); +} +inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_fd(Gr, Bl, mat_out, smat_in, d_out, d_in); +} +inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_df(Gr, Bl, mat_out, smat_in, d_out, d_in); +} +inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_dd(Gr, Bl, mat_out, smat_in, d_out, d_in); +} + +inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_ff_trans(Gr, Bl, mat_out, smat_in, d_out, d_in); +} +inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_fd_trans(Gr, Bl, mat_out, smat_in, d_out, d_in); +} +inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_df_trans(Gr, Bl, mat_out, smat_in, d_out, d_in); +} +inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { + cuda_copy_from_smat_dd_trans(Gr, Bl, mat_out, smat_in, d_out, d_in); +} + inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { cudaF_copy_col_from_vec(Gr,Bl,mat,v,col,d); } inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr,Bl,mat,d); } inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); } diff --git a/src/cudamatrix/cu-matrix-lib.h b/src/cudamatrix/cu-matrix-lib.h index 8efab93a42f..ef21a2945f1 100644 --- a/src/cudamatrix/cu-matrix-lib.h +++ b/src/cudamatrix/cu-matrix-lib.h @@ -26,6 +26,7 @@ #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-sp-matrix.h" #include "cudamatrix/cu-tp-matrix.h" +#include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cu-block-matrix.h" #include "cudamatrix/cu-rand.h" diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 170a43f8ea0..9c207103e85 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -39,7 +39,6 @@ #include "cudamatrix/cu-sp-matrix.h" #include "cudamatrix/cu-tp-matrix.h" #include "cudamatrix/cu-block-matrix.h" -#include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cublas-wrappers.h" namespace kaldi { @@ -255,6 +254,53 @@ template void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixTransposeType Trans); +template +template +void CuMatrixBase::CopyFromSmat(const CuSparseMatrix &M, + MatrixTransposeType trans) { + // Sanity check. + if (trans == kNoTrans) { + KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_); + } else { + KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_); + } +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(CU1DBLOCK, 1); + dim3 dimGrid(n_blocks(M.NumElements(), CU1DBLOCK), 1); + if (trans == kNoTrans) { + cuda_copy_from_smat(dimGrid, dimBlock, this->data_, + M.Data(), this->Dim(), M.NumElements()); + } else { + cuda_copy_from_smat_trans(dimGrid, dimBlock, this->data_, + M.Data(), this->Dim(), M.NumElements()); + } + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + Mat().CopyFromSmat(M.Mat(), trans); + } +} + +// Instantiate the template above. +template +void CuMatrixBase::CopyFromSmat(const CuSparseMatrix &M, + MatrixTransposeType trans); + +template +void CuMatrixBase::CopyFromSmat(const CuSparseMatrix &M, + MatrixTransposeType trans); + +template +void CuMatrixBase::CopyFromSmat(const CuSparseMatrix &M, + MatrixTransposeType trans); + +template +void CuMatrixBase::CopyFromSmat(const CuSparseMatrix &M, + MatrixTransposeType trans); + template template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, @@ -2240,9 +2286,6 @@ void CuMatrixBase::CopyFromGeneralMat(const GeneralMatrix &src, return; } #endif - Matrix mat(trans == kNoTrans ? smat.NumRows() : smat.NumCols(), - trans == kNoTrans ? smat.NumCols() : smat.NumRows(), - kUndefined); Mat().CopyFromSmat(smat, trans); return; } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 18ec1e04c5a..3fa2d35fd9e 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -31,6 +31,7 @@ #include "cudamatrix/cu-matrixdim.h" #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-value.h" +#include "cudamatrix/cu-sparse-matrix.h" #include "matrix/matrix-common.h" #include "matrix/kaldi-matrix.h" #include "matrix/sparse-matrix.h" diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 60b96c3b43c..f6c465bde10 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -1,11 +1,6 @@ -// cudamatrix/cu-matrix.cc +// cudamatrix/cu-sparse-matrix.cc -// Copyright 2009-2012 Karel Vesely, Lucas Ondel -// 2013 Ehsan Variani -// 2013 Johns Hopkins University (author: Daniel Povey) -// 2013 Hainan Xu -// 2013 Xiaohui Zhang -// 2013-2015 Guoguo Chen +// Copyright 2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors // @@ -36,2471 +31,139 @@ #include "cudamatrix/cu-randkernels.h" #include "cudamatrix/cu-array.h" #include "cudamatrix/cu-math.h" -#include "cudamatrix/cu-sp-matrix.h" -#include "cudamatrix/cu-tp-matrix.h" -#include "cudamatrix/cu-block-matrix.h" +#include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cublas-wrappers.h" namespace kaldi { -template -void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type) { - // This code does not currently support the other resize_type options. - KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); - if (rows * cols == 0) KALDI_ASSERT(rows == 0 && cols == 0); - if (this->num_rows_ == rows && this->num_cols_ == cols) { - if (resize_type == kSetZero) this->SetZero(); - return; - } - - if (this->num_rows_ != 0) - this->Destroy(); - if (rows == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - MatrixIndexT row_bytes = cols * sizeof(Real); - size_t pitch; - this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( - row_bytes, rows, &pitch)); - this->num_rows_ = rows; - this->num_cols_ = cols; - this->stride_ = pitch / sizeof(Real); - if (resize_type == kSetZero) this->SetZero(); - CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); - } else -#endif - { // Let the initializer of Matrix handle the allocation, - // and then just do Swap which will switch the pointers. - // This wastes a few instructions but is simple to code. - Matrix mat(rows, cols, resize_type); - this->Swap(&mat); - } -} - -template -void CuMatrix::Destroy() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (this->data_ != NULL) { - Timer tim; - CuDevice::Instantiate().Free(this->data_); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } - } else -#endif - { - if (this->data_ != NULL) KALDI_MEMALIGN_FREE(this->data_); - } - this->data_ = NULL; - this->num_rows_ = 0; - this->num_cols_ = 0; - this->stride_ = 0; -} - -template -void CuMatrix::Swap(CuMatrix *mat) { - std::swap(mat->data_, this->data_); - std::swap(mat->num_cols_, this->num_cols_); - std::swap(mat->num_rows_, this->num_rows_); - std::swap(mat->stride_, this->stride_); -} - - -template -void CuMatrix::Swap(Matrix *mat) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (this->num_rows_ == 0) { - if (mat->num_rows_ != 0) { - // *this is empty, but mat is nonempty. - this->Resize(mat->num_rows_, mat->num_cols_, kUndefined); - this->CopyFromMat(*mat); - mat->Resize(0, 0); - } - // else both are empty. - } else { // *this is nonempty. - if (mat->num_rows_ != 0) { - // Both *this and *mat are nonempty. Recurse to simpler cases. - // this could be done more efficiently in the case where - // the size does not change. - Matrix temp; - this->Swap(&temp); // now temp is full, *this is empty. - mat->Swap(&temp); // now mat has data from *this, temp has - // data from mat. - this->Swap(&temp); // copy data in mat to *this, which is now empty. - } else { // *this is full but *mat is empty. - mat->Resize(this->num_rows_, this->num_cols_, kUndefined); - this->CopyToMat(mat); - this->Destroy(); - } - } - } else -#endif - { - std::swap(mat->data_, this->data_); - std::swap(mat->num_cols_, this->num_cols_); - std::swap(mat->num_rows_, this->num_rows_); - std::swap(mat->stride_, this->stride_); - } -} - -template -void CuMatrixBase::CopyFromBlock(const CuBlockMatrix &B, - MatrixTransposeType trans) { - this->SetZero(); - if (trans == kNoTrans) { - KALDI_ASSERT(NumRows() == B.NumRows() && NumCols() == B.NumCols()); - int32 row_offset = 0, col_offset = 0; - for (int32 b = 0; b < B.NumBlocks(); b++) { - const CuMatrixBase &block = B.Block(b); - int32 num_rows = block.NumRows(), num_cols = block.NumCols(); - CuSubMatrix this_block(*this, row_offset, num_rows, - col_offset, num_cols); - this_block.CopyFromMat(block); - row_offset += num_rows; - col_offset += num_cols; - } - KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols()); - } else { - KALDI_ASSERT(NumRows() == B.NumCols() && NumCols() == B.NumRows()); - int32 row_offset = 0, col_offset = 0; - for (int32 b = 0; b < B.NumBlocks(); b++) { - const CuMatrixBase &block = B.Block(b); - int32 num_rows = block.NumCols(), num_cols = block.NumRows(); - CuSubMatrix this_block(*this, row_offset, num_rows, - col_offset, num_cols); - this_block.CopyFromMat(block, kTrans); - row_offset += num_rows; - col_offset += num_cols; - } - KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols()); - } -} - - -template - CuMatrix::CuMatrix(const CuBlockMatrix &B, - MatrixTransposeType trans): CuMatrixBase() { - if (trans == kNoTrans) { - Resize(B.NumRows(), B.NumCols(), kUndefined); - this->CopyFromBlock(B); - } else { - Resize(B.NumCols(), B.NumRows(), kUndefined); - this->CopyFromBlock(B, kTrans); - } -} - -template -template -void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, - MatrixTransposeType Trans) { - if (sizeof(Real) == sizeof(OtherReal) && - static_cast(M.Data()) == - static_cast(this->Data())) { - if (M.Data() == NULL) - return; - // CopyFromMat called on same data. Nothing to do (except sanity checks) - KALDI_ASSERT(Trans == kNoTrans && M.NumRows() == NumRows() && - M.NumCols() == NumCols() && M.Stride() == Stride()); - return; - } -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (Trans == kNoTrans) { - KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_); - } else { - KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_); - } - if (M.num_rows_ == 0) return; // Nothing to do. - Timer tim; - if (sizeof(Real) == sizeof(OtherReal) && Trans == kNoTrans ) { - MatrixIndexT dst_pitch = stride_ * sizeof(Real); - MatrixIndexT src_pitch = M.Stride() * sizeof(Real); - MatrixIndexT width = M.NumCols() * sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, M.data_, src_pitch, - width, M.num_rows_, cudaMemcpyDeviceToDevice)); - } else { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // We are making this kernel "newer-style, with x corresponding to - // row dimension and y to column dimension. - dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); - if (Trans == kNoTrans) { - cuda_copy_from_mat(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); - } else { - cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); - } - } - CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from other CuMatrixBase)", tim.Elapsed()); - } else -#endif - { - Mat().CopyFromMat(M.Mat(), Trans); - } -} - -// Instantiate the template above. -template -void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, - MatrixTransposeType Trans); -template -void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, - MatrixTransposeType Trans); -template -void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, - MatrixTransposeType Trans); -template -void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, - MatrixTransposeType Trans); - -template -template -void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, - MatrixTransposeType Trans) { - KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - if (num_rows_ == 0) - return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), - n_blocks(num_rows_, CU2DBLOCK)); - if (Trans == kNoTrans) { - cuda_copy_from_tp(dimGrid, dimBlock, data_, M.Data(), Dim()); - } else { - cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim()); - } - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().CopyFromTp(M.Mat(), Trans); - } -} -// instantiate the template above. -template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, - MatrixTransposeType Trans); -template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, - MatrixTransposeType Trans); -template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, - MatrixTransposeType Trans); -template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, - MatrixTransposeType Trans); - -template -void CuMatrixBase::CopyFromMat(const MatrixBase &src, - MatrixTransposeType trans) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (trans == kNoTrans) { - KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); - Timer tim; - - MatrixIndexT dst_pitch = stride_*sizeof(Real); - MatrixIndexT src_pitch = src.Stride()*sizeof(Real); - MatrixIndexT width = src.NumCols()*sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, - width, src.NumRows(), cudaMemcpyHostToDevice)); - - CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)",tim.Elapsed()); - } else { - CuMatrix trans_mat(src); // Do the transpose on the GPU board. - this->CopyFromMat(trans_mat, kTrans); - } - } else -#endif - { - Mat().CopyFromMat(src, trans); - } -} - -template -template -void CuMatrixBase::CopyFromMat(const MatrixBase &src, - MatrixTransposeType trans) { - CuMatrix temp(src); - this->CopyFromMat(temp, trans); -} - -// instantiate the template above. -template -void CuMatrixBase::CopyFromMat(const MatrixBase &src, - MatrixTransposeType trans); -template -void CuMatrixBase::CopyFromMat(const MatrixBase &src, - MatrixTransposeType trans); - - -template -void CuMatrixBase::CopyFromSp(const CuSpMatrix &M) { - KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_); - if (num_rows_ == 0) - return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumRows(), CU2DBLOCK)); - cuda_copy_from_sp(dimGrid, dimBlock, M.Data(), data_, Dim()); - CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromSp",tim.Elapsed()); - } else -#endif - { - Mat().CopyFromSp(M.Mat()); - } -} - -template -CuMatrix::CuMatrix(const CuMatrix &other, MatrixTransposeType trans) { - if (trans == kNoTrans) - this->Resize(other.NumRows(), other.NumCols(), kUndefined); - else - this->Resize(other.NumCols(), other.NumRows(), kUndefined); - this->CopyFromMat(other, trans); -} - -template -CuMatrix::CuMatrix(const CuMatrixBase &other, MatrixTransposeType trans) { - if (trans == kNoTrans) - this->Resize(other.NumRows(), other.NumCols(), kUndefined); - else - this->Resize(other.NumCols(), other.NumRows(), kUndefined); - this->CopyFromMat(other, trans); -} - - -template -template -CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans) { - if (trans == kNoTrans) - this->Resize(other.NumRows(), other.NumCols(), kUndefined); - else - this->Resize(other.NumCols(), other.NumRows(), kUndefined); - this->CopyFromMat(other, trans); -} -// Instantiate the template above. -template -CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); -template -CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); -template -CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); -template -CuMatrix::CuMatrix(const MatrixBase &other, MatrixTransposeType trans); - - -template -template -void CuMatrixBase::CopyToMat(MatrixBase *dst, - MatrixTransposeType trans) const { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (trans == kTrans || sizeof(OtherReal) != sizeof(Real)) { - CuMatrix this_trans(*this, trans); - this_trans.CopyToMat(dst, kNoTrans); - } else { - KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols()); - if (num_rows_ == 0) return; - Timer tim; - - MatrixIndexT src_pitch = stride_*sizeof(Real); - MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real); - MatrixIndexT width = NumCols()*sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(dst->Data(), dst_pitch, this->data_, src_pitch, - width, this->num_rows_, cudaMemcpyDeviceToHost)); - - CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H",tim.Elapsed()); - } - } else - #endif - { - dst->CopyFromMat(Mat(), trans); - } -} - -// instantiate the template above. -template -void CuMatrixBase::CopyToMat(MatrixBase *dst, - MatrixTransposeType trans) const; -template -void CuMatrixBase::CopyToMat(MatrixBase *dst, - MatrixTransposeType trans) const; -template -void CuMatrixBase::CopyToMat(MatrixBase *dst, - MatrixTransposeType trans) const; -template -void CuMatrixBase::CopyToMat(MatrixBase *dst, - MatrixTransposeType trans) const; - - - - - -template -void CuMatrix::Read(std::istream &is, bool binary) { - Matrix temp; - temp.Read(is, binary); - Destroy(); - Swap(&temp); -} - -template -void CuMatrixBase::Write(std::ostream &os, bool binary) const { - Matrix temp(this->num_rows_, this->num_cols_, kUndefined); - this->CopyToMat(&temp); - temp.Write(os, binary); -} - -template -void CuMatrixBase::SetZero() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0, - num_cols_ * sizeof(Real), num_rows_ )); - CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim.Elapsed()); - } else -#endif - { - Mat().SetZero(); - } -} - - - - -/* - * Methods wrapping the ANSI-C CUDA kernels - */ -template -void CuMatrixBase::Set(Real value) { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (num_rows_ == 0) return; - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_set_const(dimGrid, dimBlock, data_, value, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().Set(value); - } -} - -// set zero the elements above the diagonal. -template -void CuMatrixBase::SetZeroAboveDiag() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (num_rows_ == 0) return; - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_set_zero_above_diag(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - MatrixBase &mat = Mat(); - int32 num_rows = mat.NumRows(), num_cols = mat.NumCols(); - for (int32 r = 0; r + 1 < num_rows; r++) { - SubVector vec(mat, r), - vec_part(vec, r + 1, num_cols - (r + 1)); - vec_part.SetZero(); - } - } -} - -template -void CuMatrixBase::Add(Real value) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (num_rows_ == 0) return; - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add(dimGrid, dimBlock, data_, value, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().Add(value); - } -} - -template -void CuMatrixBase::AddToDiag(Real value) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (num_rows_ == 0) return; - Timer tim; - // We'll create a fake matrix with "num_diag" rows, one - // columnn, and a stride of "this_stride". The y-value of - // the grid/blocks corresponds to the row, in this kernel. - MatrixIndexT num_diag = std::min(num_rows_, num_cols_), - this_stride = stride_ + 1; - dim3 dimBlock(1, CU1DBLOCK); - dim3 dimGrid(1, n_blocks(num_diag, CU1DBLOCK)); - ::MatrixDim d = { num_diag, 1, this_stride }; - cuda_add(dimGrid, dimBlock, data_, value, d); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().AddToDiag(value); - } -} - -template -bool CuMatrixBase::IsUnit(Real tol) const { - // want to return: - //FrobeniusNorm(*this - I) <= tol * NumRows(), i.e.: - //sqrt (trace((*this - I)(*this-I)) <= tol * NumRows() - // trace((*this - I)(*this - I)) <= tol * NumRows() - // trace(*this * *this) + trace(I) - 2 * trace(*this) <= tol * NumRows() - // trace(*this * *this) + dim - 2*this.Trace() <= tol * NumRows() - KALDI_ASSERT(this->NumRows() == this->NumCols()); - return (TraceMatMat(*this, *this, kTrans) + this->NumRows() - 2.0 * this->Trace() <= - tol * this->NumRows()); -} - - - -template -void CuMatrixBase::Scale(Real value) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (num_rows_ == 0) return; - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_scale(dimGrid, dimBlock, data_, value, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().Scale(value); - } -} - -template -void CuMatrixBase::ApplyLog() { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (num_rows_ == 0) return; - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_apply_log(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().ApplyLog(); - } -} - -template -void CuMatrixBase::MulElements(const CuMatrixBase& A) { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - KALDI_ASSERT(num_cols_ == A.NumCols()); - KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_mul_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().MulElements(A.Mat()); - } -} - -template -void CuMatrixBase::Max(const CuMatrixBase& A) { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - KALDI_ASSERT(num_cols_ == A.NumCols()); - KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_max(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().Max(A.Mat()); - } -} - - -template -void CuMatrixBase::MulColsVec(const CuVectorBase &scale) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - KALDI_ASSERT(scale.Dim() == NumCols()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_mul_cols_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().MulColsVec(scale.Vec()); - } -} - - - -template -void CuMatrixBase::MulRowsVec(const CuVectorBase &scale) { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - KALDI_ASSERT(scale.Dim() == NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_mul_rows_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().MulRowsVec(scale.Vec()); - } -} - -template -void CuMatrixBase::MulRowsGroupMat(const CuMatrixBase &src) { - KALDI_ASSERT(src.NumCols() > 0); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - int group_size = this->NumCols() / src.NumCols(); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), - n_blocks(NumRows(), CU2DBLOCK)); - - cuda_mul_rows_group_mat(dimGrid, dimBlock, this->data_, src.data_, - this->Dim(), src.Stride(), group_size); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().MulRowsGroupMat(src.Mat()); - } -} -template -void CuMatrixBase::GroupPnormDeriv(const CuMatrixBase &src1, - const CuMatrixBase &src2, - Real power) { - KALDI_ASSERT(src2.NumCols() > 0); - int group_size = this->NumCols() / src2.NumCols(); - KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_calc_pnorm_deriv(dimGrid, dimBlock, this->data_, src1.Data(), src2.Data(), Dim(), src2.Stride(), group_size, power); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().GroupPnormDeriv(src1.Mat(), src2.Mat(), power); - } -} - -template -void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - KALDI_ASSERT(div.Dim() == NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_div_rows_vec(dimGrid, dimBlock, data_, div.data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Vector temp(div.Vec()); // will copy. - temp.InvertElements(); - Mat().MulRowsVec(temp); - } -} - -template -void CuMatrixBase::InvertElements() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_invert_elements(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().InvertElements(); - } -} - - -template -void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, - MatrixTransposeType transA) { - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (transA == kNoTrans) { - KALDI_ASSERT(A.NumRows() == num_rows_ && A.NumCols() == num_cols_); - } else { - KALDI_ASSERT(A.NumCols() == num_rows_ && A.NumRows() == num_cols_); - } - if (num_rows_ == 0) return; - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - cuda_add_mat(dimGrid, dimBlock, alpha, A.data_, data_, Dim(), A.Stride(), - (transA == kTrans ? 1 : 0)); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().AddMat(alpha, A.Mat(), transA); - } -} - -template -void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, - const CuMatrixBase &B, const CuMatrixBase &C) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - KALDI_ASSERT(num_rows_ == A.num_rows_ && num_cols_ == A.num_cols_); - KALDI_ASSERT(num_rows_ == B.num_rows_ && num_cols_ == B.num_cols_); - KALDI_ASSERT(num_rows_ == C.num_rows_ && num_cols_ == C.num_cols_); - if (num_rows_ == 0) return; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_, data_, Dim(), A.Stride(), B.Stride(), C.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().AddMatMatDivMat(A.Mat(), B.Mat(), C.Mat()); - } -} - -template -void CuMatrixBase::AddVecToCols(Real alpha, - const CuVectorBase &col, - Real beta) { - if (col.Dim() != NumRows()) { - KALDI_ERR << "Non matching dimensions: Rows:" << NumRows() << " VectorDim:" << col.Dim(); - } - - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - if (beta != 1.0) Mat().Scale(beta); - Mat().AddVecToCols(alpha, col.Vec()); - } -} - - - -template -void CuMatrixBase::AddVecToRows(Real alpha, - const CuVectorBase &row, - Real beta) { - if (row.Dim() != NumCols()) { - KALDI_ERR << "Non matching dimensions: Cols:" << NumCols() << " VectorDim:" << row.Dim(); - } -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add_vec_to_rows(dimGrid, dimBlock, alpha, row.data_, beta, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - if (beta != 1.0) Mat().Scale(beta); - Mat().AddVecToRows(alpha, row.Vec()); - } -} - - - -/* - * Method wrapping the CUBLAS function GEMM - */ -template -void CuMatrixBase::AddMatMat( - Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, - const CuMatrixBase &B, MatrixTransposeType transB, Real beta) { - - - // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping? - // keep trans..., just swap A&B matrices: A->B B->A - MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols()); - MatrixIndexT n = ((transA==kTrans)? A.NumCols() : A.NumRows()); - MatrixIndexT k = ((transB==kTrans)? B.NumCols() : B.NumRows()); - MatrixIndexT k1 = ((transA==kTrans)? A.NumRows() : A.NumCols()); - - KALDI_ASSERT(m == NumCols()); - KALDI_ASSERT(n == NumRows()); - KALDI_ASSERT(k == k1); - - if (m == 0) return; - - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - cublas_gemm((transB==kTrans?'T':'N'), (transA==kTrans?'T':'N'), m, n, k, - alpha, B.data_, B.Stride(), A.data_, A.Stride(), - beta, data_, Stride()); - - CU_SAFE_CALL(cublasGetError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().AddMatMat(alpha, A.Mat(), transA, B.Mat(), transB, beta); - } -} - - - -template -void CuMatrixBase::SymAddMat2( - Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, - Real beta) { - KALDI_ASSERT(num_rows_ == num_cols_ && - ((transA == kNoTrans && A.num_rows_ == num_rows_) || - (transA == kTrans && A.num_cols_ == num_cols_))); - if (num_rows_ == 0) return; - KALDI_ASSERT(A.data_ != data_); - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - char trans = (transA == kTrans ? 'N' : 'T'); - MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_); - - cublas_syrk('U', trans, num_rows_, A_other_dim, alpha, A.Data(), - A.Stride(), beta, this->data_, this->stride_); - - CU_SAFE_CALL(cublasGetError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().SymAddMat2(alpha, A.Mat(), transA, beta); - } -} - - -template -void CuMatrixBase::AddDiagVecMat( - const Real alpha, CuVectorBase &v, - const CuMatrixBase &M, MatrixTransposeType transM, - Real beta) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (transM == kNoTrans) { - KALDI_ASSERT(SameDim(*this, M)); - } else { - KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows()); - } - KALDI_ASSERT(v.Dim() == this->NumRows()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), - n_blocks(num_rows_, CU2DBLOCK)); - - MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; - if (transM == kTrans) - std::swap(M_row_stride, M_col_stride); - cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha, data_, Dim(), - v.Data(), M.Data(), M_row_stride, M_col_stride, beta); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().AddDiagVecMat(alpha, v.Vec(), M.Mat(), transM, beta); - } -} - - -template -void CuMatrixBase::AddMatDiagVec( - const Real alpha, - const CuMatrixBase &M, MatrixTransposeType transM, - CuVectorBase &v, - Real beta) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (transM == kNoTrans) { - KALDI_ASSERT(SameDim(*this, M)); - } else { - KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows()); - } - KALDI_ASSERT(v.Dim() == this->NumCols()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // Caution, this dimGrid is not the same way around as much of the other - // code: going forward, I want to use the (rows, cols) order. - dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); - - MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; - if (transM == kTrans) std::swap(M_row_stride, M_col_stride); - - cuda_add_mat_diag_vec(dimGrid, dimBlock, alpha, data_, Dim(), - M.Data(), M_row_stride, M_col_stride, v.Data(), beta); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().AddMatDiagVec(alpha, M.Mat(), transM, v.Vec(), beta); - } -} - -template -void CuMatrixBase::AddMatMatElements(Real alpha, - const CuMatrixBase &A, const CuMatrixBase &B, Real beta) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - cuda_add_mat_mat_elements(dimGrid, dimBlock, this->data_, A.Data(), B.Data(), Dim(), A.Stride(), B.Stride(), alpha, beta); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().AddMatMatElements(alpha, A.Mat(), B.Mat(), beta); - } -} - - -template -void CuMatrixBase::Sigmoid(const CuMatrixBase &src) { - KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - - cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().Sigmoid(src.Mat()); - } -} - -template -void CuMatrixBase::SoftHinge(const CuMatrixBase &src) { - KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - - cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().SoftHinge(src.Mat()); - } -} - -template -void CuMatrixBase::GroupPnorm(const CuMatrixBase &src, Real power) { - int group_size = src.NumCols() / this->NumCols(); - KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size && - this->NumRows() == src.NumRows()); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size, power); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - Mat().GroupPnorm(src.Mat(), power); - } -} - -/* -Think of sv_labels as a Matrix, denoting the "correct" label of each frame to -each phone-state; it's very likely to contain a LOT of zeros - -tot_weight = the sum of ALL element in matrix sv_labels -tot_objf = the sum of the product of (each element in matrix sv_labels) and (the - log of its counterpart in matrix output) - -an element in "this" matrix = (the element in matrix sv_labels) divided by (the element in matrix output) -*/ -template -void CuMatrix::CompObjfAndDeriv(const std::vector >& sv_labels, - const CuMatrix &output, - Real *tot_objf, Real* tot_weight) { - { // check the input. - typedef typename std::vector >::const_iterator Iter; - MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; - for (Iter iter = sv_labels.begin(); iter != sv_labels.end(); ++iter) { - KALDI_ASSERT(iter->row < num_rows && iter->row >= 0 && - iter->column < num_cols && iter->column >= 0); - } - } - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (sv_labels.empty()) { - KALDI_WARN << "Empty supervision labels"; - *tot_objf = 0.0; - *tot_weight = 0.0; - return; - } - void *addr = CuDevice::Instantiate().Malloc(sv_labels.size() * sizeof(MatrixElement)); - CU_SAFE_CALL(cudaMemcpy(addr, sv_labels.data(), sv_labels.size() * sizeof(MatrixElement), cudaMemcpyHostToDevice)); - Timer tim; - CuVector tmp(2, kUndefined); - int dimBlock(CU1DBLOCK); - int dimGrid = 1; // only 1 block here. we have loops in each thread. - cuda_comp_obj_deriv(dimGrid, dimBlock, (MatrixElement*)addr, - sv_labels.size(), output.Data(), output.Dim(), - this->Data(), this->Dim(), tmp.Data()); - Vector tmp_cpu(tmp); - *tot_objf = tmp_cpu(0); - *tot_weight = tmp_cpu(1); - CuDevice::Instantiate().Free(addr); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - *tot_objf = 0.0; - *tot_weight = 0.0; - for(int32 i = 0; i= 0 && label < nnet_.OutputDim()); - Real this_prob = output(m, label); - KALDI_ASSERT(this_prob >= 0.99e-20); // we floored to 1.0e-20 in SoftmaxLayer. - *tot_objf += weight * log(this_prob); - *tot_weight += weight; - (*this)(m, label) += weight / this_prob; - } - } -} - -template // Y->this, X->src -void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { - KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - size_t dimBlock = src.num_cols_ > CU1DBLOCK ? CU1DBLOCK : src.num_cols_; - size_t dimGrid = src.num_rows_; - cuda_softmax_reduce(dimGrid, dimBlock, data_, src.data_, Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else - #endif - { - MatrixBase &mat(this->Mat()); - mat.CopyFromMat(src.Mat()); - for(MatrixIndexT r = 0; r < mat.NumRows(); r++) { - mat.Row(r).ApplySoftMax(); - } - } -} - -template // Y->this, X->src -void CuMatrixBase::ApplyLogSoftMaxPerRow(const CuMatrixBase &src) { - KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - size_t dimBlock = src.num_cols_ > CU1DBLOCK ? CU1DBLOCK : src.num_cols_; - size_t dimGrid = src.num_rows_; - cuda_log_softmax_reduce(dimGrid, dimBlock, - data_, src.data_, Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - MatrixBase &mat(this->Mat()); - mat.CopyFromMat(src.Mat()); - for(MatrixIndexT r = 0; r < mat.NumRows(); r++) { - mat.Row(r).ApplyLogSoftMax(); - } - } -} - -// DiffSigmoid(Ein, Y, Eout) -> Eout.DiffSigmoid(Y, Ein). -template // Eout -> *this, Ein -> diff, Y -> value -void CuMatrixBase::DiffSigmoid(const CuMatrixBase &value, - const CuMatrixBase &diff) { - KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - - cuda_diff_sigmoid(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().DiffSigmoid(value.Mat(), diff.Mat()); - } -} - - -template -void CuMatrixBase::Tanh(const CuMatrixBase &src) { - KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - - cuda_tanh(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().Tanh(src.Mat()); - } -} - - - -template // Ein -> diff, Y -> value -void CuMatrixBase::DiffTanh(const CuMatrixBase &value, - const CuMatrixBase &diff) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - - cuda_diff_tanh(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().DiffTanh(value.Mat(), diff.Mat()); - } -} - -template -void CuMatrixBase::FindRowMaxId(CuArray *id) const { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - // initialize the vectors - CuVector max(num_rows_); - max.Set(-1e21); - id->Resize(num_rows_); - id->Set(-1); - - MatrixDim d=Dim(); // only stride will be used! - - // process per 256 column blocks - for (int32 block = 0; (block+1)*256 <= num_cols_; block++) { - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(1, num_rows_); - int32 offset = block*CU1DBLOCK; - - cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, - max.data_, id->Data(), offset, d); - } - - // process the remainder - int32 div = num_cols_ / 256; - int32 mod = num_cols_ % 256; - if (mod != 0) { - dim3 dimBlock(mod, 1); - dim3 dimGrid(1, num_rows_); - int32 offset=div*256; - - cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, - max.data_, id->Data(), offset, d); - } - // now we have the indices! - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - // allocate index buffer - id->Resize(num_rows_); - id->Set(-1); - // find maxima - MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; - for(MatrixIndexT r = 0; r < num_rows; r++) { - Real max = -1e21; - int32 max_id = -1; - const Real *row_data = Mat().RowData(r); - for(MatrixIndexT c = 0; c < num_cols; c++) { - if (max < row_data[c]) { - max = row_data[c]; - max_id = c; - } - } - id->Data()[r] = max_id; - } - } -} - -template -void CuMatrixBase::DiffXent(const CuArray &tgt, - CuVector *log_post_tgt) { - - KALDI_ASSERT(tgt.Dim() == num_rows_); - log_post_tgt->Resize(tgt.Dim()); - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - - dim3 dimBlock(1, CU2DBLOCK*8); - dim3 dimGrid(1, n_blocks(tgt.Dim(), CU2DBLOCK*8)); - cuda_diff_xent(dimGrid, dimBlock, tgt.Data(), data_, - log_post_tgt->data_, Dim()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - MatrixIndexT num_rows = num_rows_; - for(int32 r = 0; r < num_rows; r++) { - int32 col_tgt = tgt.Data()[r]; - Real &value = Mat()(r, col_tgt); - log_post_tgt->Vec()(r) = log(value); - value -= 1.0; - } - } -} - - -template -void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { - KALDI_ASSERT(this->NumRows() == this->NumCols()); - const int32 block_size = 64; // We can tune this. -#if HAVE_CUDA == 1 - bool have_gpu = CuDevice::Instantiate().Enabled(); -#else - bool have_gpu = false; -#endif - if (this->NumRows() == 0) { - return; - } - if (inv_cholesky == NULL && this->NumRows() >= block_size * 2 && have_gpu) { - // Even if the user did not request the inverse Cholesky, for large enough - // matrices (on GPUs) it's going to be more efficient to compute it anyway - // as the recursion depends on it. - CuMatrix inv(this->NumRows(), this->NumCols()); - Cholesky(&inv); - return; - } - if (this->NumRows() <= block_size || inv_cholesky == NULL || !have_gpu) { - // Don't recurse: compute the Cholesky (and inverse Cholesky, if requested) - // directly, on the CPu. - int32 dim = this->NumRows(); - CuSpMatrix this_sp(dim, kUndefined); - this_sp.CopyFromMat(*this, kTakeLower); - SpMatrix this_sp_cpu(this_sp); - TpMatrix C_cpu(dim); - C_cpu.Cholesky(this_sp_cpu); - CuTpMatrix C(C_cpu); - this->CopyFromTp(C); - if (inv_cholesky != NULL) { - C_cpu.Invert(); // Get inverse Cholesky on CPU. - C.CopyFromTp(C_cpu); - inv_cholesky->CopyFromTp(C); // Copy inverse Cholesky from CPU. - } - return; - } - // At this point, if none of the other cases apply, we recurse. - - // The selection of dim1 is a heuristic. We could also just take half. - int32 tot_dim = this->NumRows(); - int32 dim1; - // Break it up into a whole number of blocks, for better memory alignment. - // The line below, setting dim1 can be decided on a heuristic basis: from - // the point of view of correctness, it can really be any value - // 0 < dim1 < tot_dim. - dim1 = block_size * std::max(1, tot_dim / (2 * block_size)); - - int32 dim2 = tot_dim - dim1; - CuSubMatrix this_11(*this, 0, dim1, 0, dim1), - this_12(*this, 0, dim1, dim1, dim2), - this_21(*this, dim1, dim2, 0, dim1), - this_22(*this, dim1, dim2, dim1, dim2); - CuSubMatrix inv_11(*inv_cholesky, 0, dim1, 0, dim1), - inv_12(*inv_cholesky, 0, dim1, dim1, dim2), - inv_21(*inv_cholesky, dim1, dim2, 0, dim1), - inv_22(*inv_cholesky, dim1, dim2, dim1, dim2); - /* - Here is the math on block-wise Cholesky. We'll use a Matlab-like notation for blocks of a matrix, - e.g. [ A B; C D ], and also for transposes, e.g. A' is the transpose of A. - Let A be the input matrix; we want to compute both its Cholesky L and its inverse Cholesky, which - we'll call M. - OK. let L = [ L11 0; L21 L22 ] be the Cholesky factor of A. - We have A = L L' = [ L11 0; L21 L22 ] * [ L11' L21'; 0 L22' ]. Multiplying it out, - if A = [ A11 A12; A21 A22 ]; then - A11 = L11 L11', A21 = L21 L11', A22 = L21 L21' + L22 L22', and A12 = A21'. - - We also want an expression for the inverse of L (we call this M). - If M = [ M11 0; M21 M22 ], then it's not hard to see that - M11 = inv(L11), M22 = inv(L22). - We can work out M21 as follows. We know that [ L11 0; L21 L22 ] [ M11 0; M21 M22 ] = [ I 0; 0 I ]. - Considering the zero on the bottom of the rhs, we have: L21 M11 + L22 M21 = 0, which gives us: - M21 = - L22^{-1} L21 M11 = - M22 L21 M11. - - Next, we want expressions for L21 and L22. From the equation A21 = L21 L11', we have: - L21 = A21 inv(L11') = A21 M11' - We can compute L22 and M22 recursively by doing Cholesky (and computing the inverse Cholesky) - on the quantity T = (A22 - L21 L21'). [we give it the name T just for easy reference.] - - Computationally, we do this as follows: - (1) Recurse to get L11 and M11. - (2) Compute L21 = A21 M11' - (3) Compute T = A22 - L21 L21' - (4) Recurse on T to get L22 and M22. - (5) Compute M21 = -M22 L21 M11. - Next, we have to consider the in-place nature of the computation, since L overwrites A - [M has its own storage, in "inv_cholesky"]. - We address this here: - (1) is in-place [L11 replaces A11, M11 has its own storage]. - (2) L21 gets written where M21 belongs. - (3) T replaces A22. - (4) is in-place [L22 replaces T where A22 was, M22 has its own storage] - (5):(a) we first compute the transpose of (L21 M11) is done in the upper part of A/L, - where A12 or L12 would be. Define a temporary expression - U = (L21 M11)' = M11' L21'; this goes where A12 or L12 would be. - (b) copy L21 to where it should be, in *this. - (c) Compute M21 = -M22 U', in the correct place for M21. - (d) zero L12 and M12. */ - - // (1) compute L11 and M11. - this_11.Cholesky(&inv_11); - // (2) compute L21 = A21 M11'. For now it's in the "wrong place", where M21 should be. - inv_21.AddMatMat(1.0, this_21, kNoTrans, inv_11, kTrans, 0.0); - // (3) compute T = A22 - L21 L21'. Note: only the lower triangle of T will be valid, but - // that's OK because Cholesky will ignore the upper part. - this_22.SymAddMat2(-1.0, inv_21, kNoTrans, 1.0); - // (4) Recurse to compute L22 and M22. - this_22.Cholesky(&inv_22); - // (5)(a) compute U = M11' L21'. We use the storage of this_12 for this. Note that L21 is - // currently where M21 should be. - this_12.AddMatMat(1.0, inv_11, kTrans, inv_21, kTrans, 0.0); - // (5)(b) copy L21 to where it should be. - this_21.CopyFromMat(inv_21); - // (5)(c) compute M21 = -M22 U'. - inv_21.AddMatMat(-1.0, inv_22, kNoTrans, this_12, kTrans, 0.0); - // (5)(d) zero L12 and M12. - this_12.SetZero(); - inv_12.SetZero(); -} - - - -template -void CuMatrixBase::SymInvertPosDef() { - KALDI_ASSERT(num_rows_ == num_cols_); - if (num_rows_ == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - CuMatrix inv_cholesky(num_rows_, num_rows_); - this->Cholesky(&inv_cholesky); - // note: SymAddMat2 only updates lower part of *this. - this->SymAddMat2(1.0, inv_cholesky, kTrans, 0.0); - this->CopyLowerToUpper(); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - SpMatrix temp_sp(this->Mat(), kTakeLower); - TpMatrix C(temp_sp.NumRows(), kUndefined); - C.Cholesky(temp_sp); - C.Invert(); - temp_sp.AddTp2(1.0, C, kTrans, 0.0); - this->Mat().CopyFromSp(temp_sp); - // was previously just: CuSpMatrix::Invert(). - } -} - -template -bool CuMatrixBase::ApproxEqual(const CuMatrixBase &other, - float tol) const { - CuMatrix diff(*this); - diff.AddMat(-1.0, other); - return (diff.FrobeniusNorm() <= tol * (*this).FrobeniusNorm()); -} - -template -Real TraceMatMat(const CuMatrixBase &A, - const CuMatrixBase &B, - MatrixTransposeType trans) { - if (A.num_rows_ == 0) { - KALDI_ASSERT(B.num_rows_ == 0); - return 0.0; - } - Real result = 0; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - if (trans == kNoTrans) { - KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows()); - } else { - KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols()); - } - if (A.NumRows() * A.NumCols() > 16384) { - // This version in which we don't use a special-purpose kernel, but - // do AddDiagMat on a temporary vector and returns its sum, seems to be - // faster for larger matrices. The cutoff is approximate and - // we only looked at the time on square matrices, which - // is what we test in cu-matrix-speed-test.cc. - CuVector sum_vec(A.NumRows()); - sum_vec.AddDiagMatMat(1.0, A, kNoTrans, - B, trans, 0.0); - return sum_vec.Sum(); - } else { - Timer tim; - // the sizes of result_vec must match what we - // call the kernels with, in cu-kernels.cu - CuVector result_vec(trans == kTrans ? 4 : 2, kUndefined); - if (trans == kNoTrans) { - cuda_trace_mat_mat(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); - } else { - cuda_trace_mat_mat_trans(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); - } - CU_SAFE_CALL(cudaGetLastError()); - Vector result_cpu(result_vec); // copying from CUDA faster than summing in CUDA. - result = result_cpu.Sum(); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } - } else -#endif - { - result = TraceMatMat(A.Mat(), B.Mat(), trans); - } - return result; -} - -template -float TraceMatMat(const CuMatrixBase &A, - const CuMatrixBase &B, - MatrixTransposeType trans); -template -double TraceMatMat(const CuMatrixBase &A, - const CuMatrixBase &B, - MatrixTransposeType trans); - - -template -void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - if (v.Dim() == num_rows_*num_cols_) { - if (stride_ == num_cols_) { - const Real* v_data = v.Data(); - CU_SAFE_CALL(cudaMemcpy(data_, v_data, - sizeof(Real)*num_rows_*num_cols_, - cudaMemcpyDeviceToDevice)); - } else { - CU_SAFE_CALL(cudaMemcpy2D(data_, stride_ * sizeof(Real), v.Data(), - num_cols_*sizeof(Real), num_cols_*sizeof(Real), - num_rows_, - cudaMemcpyDeviceToDevice)); - } - } else if (v.Dim() == num_cols_) { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // this is a newer kernel where (x,y) dims represent (rows,cols). - dim3 dimGrid(n_blocks(NumRows(),CU2DBLOCK), n_blocks(NumCols(),CU2DBLOCK)); - cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data()); - } else { - KALDI_ERR << "Wrong sized arguments"; - } - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().CopyRowsFromVec(v.Vec()); - } -} - -template -void CuMatrixBase::CopyRowsFromVec(const VectorBase &v) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - if (v.Dim() == num_rows_*num_cols_) { - if (stride_ == num_cols_) { - const Real* v_data = v.Data(); - cudaMemcpy(data_, v_data, sizeof(Real)*num_rows_*num_cols_, cudaMemcpyHostToDevice); - } else { - const Real *v_data = v.Data(); - for (MatrixIndexT r = 0; r < num_rows_; r++) { - Real *row_data = RowData(r); - cudaMemcpy(row_data, v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice); - v_data += num_cols_; - } - } - } else if (v.Dim() == num_cols_) { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This is a newer kernel where x corresponds to NumRows() and y to NumCols(). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - - cuda_copy_rows_from_vec(dimGrid, dimBlock, this->data_, this->Dim(), v.Data()); - CU_SAFE_CALL(cudaGetLastError()); - - /* const Real *v_data = v.Data(); - for (MatrixIndexT r = 0; r < num_rows_; r++) - cudaMemcpy(RowData(r), v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice); */ - } else { - KALDI_ERR << "Wrong sized arguments"; - } - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().CopyRowsFromVec(v); - } -} - - -template -void CuMatrixBase::CopyColFromVec(const CuVectorBase &v, - const MatrixIndexT col) { - KALDI_ASSERT(v.Dim() == num_rows_ && - static_cast(col) < - static_cast(num_cols_)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - int dimBlock(CU1DBLOCK); - int dimGrid(n_blocks(NumRows(), CU1DBLOCK)); - cuda_copy_col_from_vec(dimGrid, dimBlock, data_, v.Data(), col, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().CopyColFromVec(v.Vec(), col); - } -} - -template -void CuMatrixBase::ApplyPow(Real power) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - - cuda_apply_pow(dimGrid, dimBlock, data_, power, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().ApplyPow(power); - } -} - -template -void CuMatrixBase::ApplyPowAbs(Real power, bool include_sign) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - - cuda_apply_pow_abs(dimGrid, dimBlock, data_, power, include_sign, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().ApplyPowAbs(power, include_sign); - } -} - -template -void CuMatrixBase::ApplyHeaviside() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - - cuda_apply_heaviside(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().ApplyHeaviside(); - } -} - - -template -void CuMatrixBase::ApplyExp() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_apply_exp(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().ApplyExp(); - } -} - - -template -void CuMatrixBase::ApplyFloor(Real floor_val) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().ApplyFloor(floor_val); - } -} - -template -void CuMatrixBase::ApplyCeiling(Real ceiling_val) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().ApplyCeiling(ceiling_val); - } -} - - -template -void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { - KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - if (mat.Stride() == mat.NumCols()) { - cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyDeviceToHost); - } else { - Real* vec_data = data_; - for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { - cudaMemcpy(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), - cudaMemcpyDeviceToHost); - vec_data += mat.NumCols(); +template +CuSparseMatrix::CuSparseMatrix(const SparseMatrix &smat) { + num_rows_ = smat.NumRows(); + num_cols_ = smat.NumCols(); + if (num_rows_ == 0 || num_cols_ == 0) return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + std::vector > cpu_elements; + for (int32 i = 0; i < smat.NumRows(); ++i) { + for (int32 j = 0; j < (smat.Data() + i)->NumElements(); ++j) { + MatrixElement e; + e.row = i; + e.column = (smat.Data() + i)->Data()->first; + e.weight = (smat.Data() + i)->Data()->second; + cpu_elements.push_back(e); } } - CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyRowsFromMat", tim.Elapsed()); - } else -#endif - { - CopyRowsFromMat(mat.Mat()); - } -} - -// Instantiate the template above. -template -void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat); -template -void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat); - - -template -void CuMatrixBase::CopyCols(const CuMatrixBase &src, - const CuArray &indices) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - KALDI_ASSERT(indices.Dim() == NumCols()); - KALDI_ASSERT(NumRows() == src.NumRows()); - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_copy_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().CopyCols(src.Mat(), indices.Data()); - } -} - - -template -void CuMatrixBase::CopyRows(const CuMatrixBase &src, - const CuArray &indices) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - KALDI_ASSERT(static_cast(indices.Dim()) == NumRows()); - KALDI_ASSERT(NumCols() == src.NumCols()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().CopyRows(src.Mat(), indices.Data()); - } -} - - -template -void CuMatrixBase::CopyRows(const CuArray &src) { - if (NumRows() == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - KALDI_ASSERT(static_cast(src.Dim()) == NumRows()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Mat().CopyRows(src.Data()); - } -} - - -template -void CuMatrixBase::CopyToRows(const CuArray &dst) const { - if (NumRows() == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - KALDI_ASSERT(static_cast(dst.Dim()) == NumRows()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - cuda_copy_to_rows(dimGrid, dimBlock, dst.Data(), data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + elements_.CopyFromVec(cpu_elements); } else #endif { - Mat().CopyToRows(dst.Data()); + this->Mat() = smat; } } - -template -void CuMatrixBase::AddRows(Real alpha, - const CuMatrixBase &src, - const CuArray &indexes) { - if (NumRows() == 0) return; +template +MatrixElement* CuSparseMatrix::Data() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - KALDI_ASSERT(static_cast(indexes.Dim()) == NumRows()); - KALDI_ASSERT(src.NumCols() == NumCols()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - cuda_add_rows(dimGrid, dimBlock, alpha, - data_, src.Data(), indexes.Data(), Dim(), src.Stride()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + if (elements_.Dim() == 0) return NULL; + else return elements_.Data(); } else #endif { - Mat().AddRows(alpha, src.Mat(), indexes.Data()); + return NULL; } } - -template -void CuMatrixBase::AddRows(Real alpha, const CuArray &src) { - if (NumRows() == 0) return; +template +const MatrixElement* CuSparseMatrix::Data() const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - KALDI_ASSERT(static_cast(src.Dim()) == NumRows()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - cuda_add_rows(dimGrid, dimBlock, alpha, data_, src.Data(), Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + if (elements_.Dim() == 0) return NULL; + else return elements_.Data(); } else #endif { - Mat().AddRows(alpha, src.Data()); + return NULL; } } - -template -void CuMatrixBase::AddToRows(Real alpha, - const CuArray &dst) const { - if (NumRows() == 0) return; +template +CuSparseMatrix& CuSparseMatrix::operator = ( + const SparseMatrix &smat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - KALDI_ASSERT(static_cast(dst.Dim()) == NumRows()); - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - cuda_add_to_rows(dimGrid, dimBlock, alpha, dst.Data(), data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuSparseMatrix tmp(smat); + elements_ = tmp.elements_; } else #endif { - Mat().AddToRows(alpha, dst.Data()); - } -} - - -template -void CuMatrixBase::SumColumnRanges(const CuMatrixBase &src, - const CuArray &indices) { - KALDI_ASSERT(static_cast(indices.Dim()) == NumCols()); - KALDI_ASSERT(NumRows() == src.NumRows()); - if (NumRows() == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), src.Dim(), indices.Data()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { // Implement here for the CPU.. - int32 num_rows = this->num_rows_, num_cols = this->num_cols_, - this_stride = this->stride_, src_stride = src.stride_; - Real *data = this->data_; - const Real *src_data = src.data_; - const Int32Pair *indices_data = indices.Data(); - for (int32 row = 0; row < num_rows; row++) { - for (int32 col = 0; col < num_cols; col++) { - int32 start_col = indices_data[col].first, - end_col = indices_data[col].second; - Real sum = 0.0; - for (int32 src_col = start_col; src_col < end_col; src_col++) - sum += src_data[row * src_stride + src_col]; - data[row * this_stride + col] = sum; - } - } - } -} - - -template -void CuMatrixBase::AddRowRanges(const CuMatrixBase &src, - const CuArray &indexes) { - KALDI_ASSERT(static_cast(indexes.Dim()) == NumCols()); - KALDI_ASSERT(src.NumCols() >= NumCols()); - if (NumRows() == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - cuda_add_row_ranges(dimGrid, dimBlock, - data_, Dim(), src.Data(), src.Dim(), indexes.Data()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { // Implement here for the CPU.. - int32 num_rows = this->num_rows_, num_cols = this->num_cols_, - this_stride = this->stride_, src_stride = src.stride_; - Real *data = this->data_; - const Real *src_data = src.data_; - const Int32Pair *indexes_data = indexes.Data(); - for (int32 row = 0; row < num_rows; row++) { - for (int32 col = 0; col < num_cols; col++) { - int32 start_row = indexes_data[col].first, - end_row = indexes_data[col].second; - Real sum = 0.0; - for (int32 src_row = start_row; src_row < end_row; src_row++) - sum += src_data[src_row * src_stride + col]; - data[row * this_stride + col] += sum; - } - } + this->Mat() = smat; } + return *this; } - -template -void CuMatrixBase::CopyLowerToUpper() { - KALDI_ASSERT(num_cols_ == num_rows_); - if (num_rows_ == 0) return; +template +CuSparseMatrix& CuSparseMatrix::operator = ( + const CuSparseMatrix &smat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - int32 dim = this->num_rows_; - dim3 dimGrid(n_blocks(dim, CU2DBLOCK), - n_blocks(dim, CU2DBLOCK)); - cuda_copy_low_upp(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + elements_ = smat.elements_; } else #endif { - Mat().CopyLowerToUpper(); + this->Mat() = smat.Mat(); } + return *this; } -template -void CuMatrixBase::CopyUpperToLower() { - KALDI_ASSERT(num_cols_ == num_rows_); - if (num_rows_ == 0) return; +template +void CuSparseMatrix::Swap(SparseMatrix *smat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - int32 dim = this->num_rows_; - dim3 dimGrid(n_blocks(dim, CU2DBLOCK), - n_blocks(dim, CU2DBLOCK)); - cuda_copy_upp_low(dimGrid, dimBlock, data_, Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { - Mat().CopyUpperToLower(); - } -} - - -template -Real CuMatrixBase::Sum() const { - CuVector row_sum(NumCols()); - row_sum.AddRowSumMat(1.0, *this, 0.0); - return row_sum.Sum(); -} - - -template -Real CuMatrixBase::Max() const { - Timer tim; - // TODO rewrite in CUDA, - Matrix tmp(NumRows(), NumCols(), kUndefined); - CopyToMat(&tmp); - Real ans = tmp.Max(); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } -#endif - return ans; -} - - -template -Real CuMatrixBase::Min() const { - Timer tim; - // TODO rewrite in CUDA, - Matrix tmp(NumRows(), NumCols(), kUndefined); - CopyToMat(&tmp); - Real ans = tmp.Min(); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } -#endif - return ans; -} - - -template -Real CuMatrixBase::Trace(bool check_square) const { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - if (check_square) KALDI_ASSERT(this->num_rows_ == this->num_cols_); - MatrixIndexT dim = std::min(this->num_rows_, this->num_cols_); - CuVector tmp(1, kUndefined); // for result. - int dimBlock(CU1DBLOCK); - int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); - cuda_vec_sum(dimGrid, dimBlock, data_, tmp.Data(), dim, Stride() + 1); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim.Elapsed()); - return tmp(0); - } else -#endif - { - return Mat().Trace(check_square); - } -} - - - - -template -void CuMatrixBase::SetRandn() { - if (num_rows_ == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - CuRand tmp; - tmp.RandGaussian(this); - } else -#endif - { - Mat().SetRandn(); + Mat().Swap(smat); } } -template -void CuMatrixBase::SetRandUniform() { - if (num_rows_ == 0) return; +template +void CuSparseMatrix::Swap(CuSparseMatrix *smat) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CuRand tmp; - tmp.RandUniform(this); - } else -#endif - { - Mat().SetRandUniform(); - } -} - -template -void Matrix::Swap(CuMatrix *mat) { mat->Swap(this); } -// instantiate the template above. -template void Matrix::Swap(CuMatrix *mat); -template void Matrix::Swap(CuMatrix *mat); - -/// Copy constructor from another type. -template -template -CuMatrix::CuMatrix(const CuMatrixBase & M, - MatrixTransposeType trans) : CuMatrixBase() { - - if (trans == kNoTrans) { - Resize(M.NumRows(), M.NumCols()); - this->CopyFromMat(M); - } else { - Resize(M.NumCols(), M.NumRows()); - this->CopyFromMat(M, kTrans); - } - -} - -// Instantiate this constructor for float->double and double->float. -template -CuMatrix::CuMatrix(const CuMatrixBase & M, - MatrixTransposeType trans); -template -CuMatrix::CuMatrix(const CuMatrixBase & M, - MatrixTransposeType trans); - - -template -void CuMatrix::Transpose() { - if (this->num_rows_ == 0) - return; -#if HAVE_CUDA == 1 - if (this->num_rows_ == this->num_cols_ && CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // (x,y) indices will be (row of *this, col of *this) - dim3 dimGrid(n_blocks(this->num_rows_, CU2DBLOCK), - n_blocks(this->num_cols_, CU2DBLOCK)); - cuda_transpose_matrix(dimGrid, dimBlock, this->data_, this->Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuArray > tmp(elements_); + elements_ = smat->elements_; + smat->elements_ = tmp; } else #endif { - CuMatrix tmp(*this, kTrans); - *this = tmp; + Mat().Swap(&(smat->Mat())); } } - -// Version of AddMatMat where 2nd argument is of type CuBlockMatrix. -template -void CuMatrixBase::AddMatBlock( - Real alpha, - const CuMatrixBase &A, MatrixTransposeType transA, - const CuBlockMatrix &B, MatrixTransposeType transB, - Real beta) { - // Check dimensions - int32 A_num_rows = A.NumRows(), A_num_cols = A.NumCols(), - A_row_stride = A.Stride(), A_col_stride = 1, - B_num_rows = B.NumRows(), B_num_cols = B.NumCols(); - if (transA == kTrans) { - std::swap(A_num_rows, A_num_cols); - std::swap(A_row_stride, A_col_stride); - } - if (transB == kTrans) { - std::swap(B_num_rows, B_num_cols); - } - // At this point the {A,B}_{rows,cols} variables are - // after any transposition. - KALDI_ASSERT(NumRows() == A_num_rows && NumCols() == B_num_cols); - KALDI_ASSERT(A_num_cols == B_num_rows); - int32 B_num_blocks = B.NumBlocks(); - +template +void CuSparseMatrix::SetRandn(BaseFloat zero_prob) { if (num_rows_ == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - MatrixDim this_dim = Dim(); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // (x,y) indices will be (row of *this, block of B) - dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), - n_blocks(B_num_blocks, CU2DBLOCK)); - - cuda_add_mat_blockmat(dimGrid, dimBlock, data_, this_dim, A.Data(), - A_num_rows, A_num_cols, A_row_stride, A_col_stride, - B.CuData(), B_num_blocks, alpha, beta, - (transB == kTrans ? 1 : 0)); - - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - // "row_offset" and "col_offset" are offsets into B (or into B^T, if - // transB == kTrans). - int32 row_offset = 0, col_offset = 0; - for (int32 b = 0; b < B_num_blocks; b++) { - const CuSubMatrix this_block = B.Block(b); - int32 this_num_rows = this_block.NumRows(), - this_num_cols = this_block.NumCols(); - if (transB == kTrans) std::swap(this_num_rows, this_num_cols); - CuSubMatrix this_part(*this, 0, num_rows_, - col_offset, this_num_cols); - CuSubMatrix A_part = (transA == kNoTrans ? - CuSubMatrix(A, 0, num_rows_, - row_offset, this_num_rows) : - CuSubMatrix(A, row_offset, this_num_rows, - 0, num_rows_)); - this_part.AddMatMat(alpha, A_part, transA, this_block, transB, beta); - row_offset += this_num_rows; - col_offset += this_num_cols; - } - // Note: the values being compared below are all after applying any - // transposition to B. - KALDI_ASSERT(row_offset == B_num_rows && col_offset == B_num_cols); - } -} - -template -void CuMatrixBase::AddElements(Real alpha, - const std::vector >& input) { - // Checks the dimension. - MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; - for (int32 i = 0; i < input.size(); ++i) { - KALDI_ASSERT(input[i].row < num_rows && input[i].row >= 0 && - input[i].column < num_cols && input[i].column >= 0); - } -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - void *addr = CuDevice::Instantiate().Malloc(input.size() * sizeof(MatrixElement)); - CU_SAFE_CALL(cudaMemcpy(addr, input.data(), - input.size() * sizeof(MatrixElement), - cudaMemcpyHostToDevice)); - - Timer tim; - int dimBlock(CU1DBLOCK); - int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); - - cuda_matrix_add_elements(dimGrid, dimBlock, this->data_, this->Dim(), - alpha, (MatrixElement*)addr, input.size()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().Free(addr); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - for (int32 i = 0; i < input.size(); i++) { - (*this)(input[i].row, input[i].column) += alpha * input[i].weight; - } - } -} - -template -void CuMatrixBase::Lookup(const std::vector &indices, - std::vector *output) const { - // Checks the dimension. - MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; - for (int32 i = 0; i < indices.size(); ++i) { - KALDI_ASSERT(indices[i].first < num_rows && indices[i].first >= 0 && - indices[i].second < num_cols && indices[i].second >= 0); - } - - // Checks the pointer. - KALDI_ASSERT(output != NULL); - - // Resizes the output vector. - output->resize(indices.size()); - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - CuArray cuda_indices(indices); - CuArray cuda_output(output->size()); - - Timer tim; - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(n_blocks(indices.size(), CU1DBLOCK), 1); - - cuda_matrix_lookup(dimGrid, dimBlock, this->data_, this->Dim(), - cuda_indices.Data(), indices.size(), cuda_output.Data()); - CU_SAFE_CALL(cudaGetLastError()); - - cuda_output.CopyToVec(output); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - for (int32 i = 0; i < indices.size(); i++) { - (*output)[i] = (*this)(indices[i].first, indices[i].second); - } - } + // Not efficient at the moment... + SparseMatrix tmp(num_rows_, num_cols_); + tmp.SetRandn(zero_prob); + Swap(&tmp); } -template -void CuMatrixBase::EqualElementMask(const CuMatrixBase &mat, CuMatrix *mask) const { - // Check the inputs: - KALDI_ASSERT(mat.NumRows() == NumRows() && mat.NumCols() == NumCols()); - KALDI_ASSERT(mask != NULL); - // Resizes the output matrix: - mask->Resize(NumRows(), NumCols(), kSetZero); - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(), mask->Data(), this->Dim(), mat.Stride(), mask->Stride()); - CU_SAFE_CALL(cudaGetLastError()); - - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - for (int32 r = 0; r < NumRows(); r++) { - for (int32 c = 0; c < NumCols(); c++) { - (*mask)(r,c) = ((*this)(r,c) == mat(r,c) ? 1.0 : 0.0); - } - } - } +template +void CuSparseMatrix::Write(std::ostream &os, bool binary) const { + SparseMatrix tmp(this->Mat()); + tmp.Write(os, binary); } - -/** - * Print the matrix to stream - */ -template -std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat) { - Matrix temp(mat.NumRows(), mat.NumCols()); - mat.CopyToMat(&temp); - out << temp; - return out; +template +void CuSparseMatrix::Read(std::istream &is, bool binary) { + SparseMatrix tmp; + tmp.Read(is, binary); + Swap(&tmp); } -// instantiate the template -template -std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); -template -std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); - - -// Instantiate classes CuMatrix and CuMatrixBase for float and double. -template class CuMatrix; -template class CuMatrix; -template class CuMatrixBase; -template class CuMatrixBase; - - - - - +template class CuSparseMatrix; +template class CuSparseMatrix; } // namespace kaldi diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h index 57612641220..b02326d48c3 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ b/src/cudamatrix/cu-sparse-matrix.h @@ -1,6 +1,7 @@ // cudamatrix/cu-sparse-matrix.h // Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors // @@ -29,6 +30,7 @@ #include "cudamatrix/cu-value.h" #include "matrix/matrix-common.h" #include "matrix/kaldi-matrix.h" +#include "matrix/sparse-matrix.h" #include "cudamatrix/cu-array.h" #include "cudamatrix/cu-math.h" #include "cudamatrix/cu-rand.h" @@ -38,12 +40,25 @@ namespace kaldi { template class CuSparseMatrix { public: + MatrixIndexT NumRows() const { return num_rows_; } + + MatrixIndexT NumCols() const { return num_cols_; } + + MatrixIndexT NumElements() const { return elements_.Dim(); } + + // returns pointer to element data, or NULL if empty (use with NumElements()). + // This should only be called when CUDA is enabled. + MatrixElement *Data(); + + // returns pointer to element data, or NULL if empty (use with NumElements()), + // const version. This should only be called when CUDA is enabled. + const MatrixElement *Data() const; /// Copy from CPU-based matrix. - CuSparseMatrix &operator = (SparseMatrix &smat); + CuSparseMatrix &operator = (const SparseMatrix &smat); /// Copy from possibly-GPU-based matrix. - CuSparseMatrix &operator = (CuSparseMatrix &smat); + CuSparseMatrix &operator = (const CuSparseMatrix &smat); /// Swap with CPU-based matrix. void Swap(SparseMatrix *smat); @@ -58,13 +73,23 @@ class CuSparseMatrix { void Write(std::ostream &os, bool binary) const; - void Read(std::istream &os, bool binary); + void Read(std::istream &is, bool binary); // Constructor from CPU-based sparse matrix. CuSparseMatrix(const SparseMatrix &smat); ~CuSparseMatrix() { } + // The following two functions should only be called if we did not compile + // with CUDA or could not get a CUDA card; in that case the contents are + // interpreted the same as a regular sparse matrix. + inline const SparseMatrix &Mat() const { + return *(reinterpret_cast* >(this)); + } + inline SparseMatrix &Mat() { + return *(reinterpret_cast* >(this)); + } + // Use the CuMatrix::CopyFromSmat() function to copy from this to // CuMatrix. // Also see CuMatrix::AddSmat(). @@ -74,6 +99,9 @@ class CuSparseMatrix { // is not enabled. It needs to be first because we reinterpret_cast this std::vector > cpu_rows_; + MatrixIndexT num_rows_; + MatrixIndexT num_cols_; + // This is where the data lives if we are using a GPU. Notice that the format // is a little different from on CPU, as there is only one list, of matrix // elements, instead of a list for each row. This is better suited to diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 79beb0d8c1e..bd15b28864a 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -1856,6 +1856,18 @@ void MatrixBase::CopyFromSmat(const SparseMatrix &mat, MatrixTransposeType trans) { mat.CopyToMat(this, trans); } +template +void MatrixBase::CopyFromSmat(const SparseMatrix &mat, + MatrixTransposeType trans); +template +void MatrixBase::CopyFromSmat(const SparseMatrix &mat, + MatrixTransposeType trans); +template +void MatrixBase::CopyFromSmat(const SparseMatrix &mat, + MatrixTransposeType trans); +template +void MatrixBase::CopyFromSmat(const SparseMatrix &mat, + MatrixTransposeType trans); template diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 4bf7faf8904..bf7a23939d9 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -232,6 +232,19 @@ MatrixIndexT SparseMatrix::NumCols() const { return rows_[0].Dim(); } +template +SparseVector* SparseMatrix::Data() { + if (rows_.empty()) return NULL; + else return rows_.data(); +} + +template +const SparseVector* SparseMatrix::Data() const { + if (rows_.empty()) return NULL; + else return rows_.data(); +} + + template template void SparseMatrix::CopyToMat(MatrixBase *other, diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h index d57bc8fa858..74df9882521 100644 --- a/src/matrix/sparse-matrix.h +++ b/src/matrix/sparse-matrix.h @@ -119,6 +119,13 @@ class SparseMatrix { void Swap(SparseMatrix *other); + // returns pointer to element data, or NULL if empty (use with NumElements()). + SparseVector *Data(); + + // returns pointer to element data, or NULL if empty (use with NumElements()); + // const version + const SparseVector *Data() const; + // initializer from the type that elsewhere in Kaldi is referred to as type Posterior. // indexed first by row-index; the pairs are (column-index, value), and the constructor // does not require them to be sorted and uniq.