Merge pull request #36 from chenguoguo/guoguo-nnet3

intermediate work on sparse matrix; nnet3bin now compiles again
kaldi-asr · Jul 30, 2015 · b1a372d · b1a372d
2 parents 53baf84 + 2e9f6f5
commit b1a372d
Show file tree

Hide file tree

Showing 12 changed files with 260 additions and 2,414 deletions.
diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile
@@ -16,7 +16,8 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test
 
 
 OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \
-           cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o
+           cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o \
+           cu-sparse-matrix.o
 ifeq ($(CUDA), true)
   OBJFILES += cu-kernels.o cu-randkernels.o
 endif

diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
@@ -303,6 +303,15 @@ void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float*
 void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
 void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in);
 
+// Copy a sparse matrix, passed as an array of d_in MatrixElement entries
+// (smat_in), into the dense matrix mat_out whose dimensions are d_out.
+// The two suffix letters give the element type of the destination first and
+// of the source second (f = float, d = double).  The _trans variants write
+// each entry with its row and column swapped.
+void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+void cuda_copy_from_smat_fd(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+void cuda_copy_from_smat_df(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+void cuda_copy_from_smat_dd(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+void cuda_copy_from_smat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in);
+
 void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement<double>* x, int s);
 void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement<double>* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t);
 

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
@@ -254,6 +254,24 @@ static void _copy_from_mat_trans(Real* mat_out, const OtherReal* mat_in, MatrixD
     mat_out[index_out] = static_cast<Real>(mat_in[index_in]);
 }
 
+// Scatters the entries of a sparse matrix, stored as an array of
+// MatrixElement<OtherReal> (row, column, weight), into the dense matrix
+// mat_out.
+//
+// Launch layout: 1-D grid of 1-D blocks, one thread per sparse entry.  d_in
+// is the number of entries in smat_in; threads whose flat index is >= d_in
+// do nothing.
+//
+// Entry i is written to mat_out[row * d_out.stride + column].  Positions of
+// mat_out that no entry addresses are left untouched, so the caller must
+// clear mat_out beforehand if it wants a complete copy.  Row and column
+// values are not range-checked here.
+template<typename Real, typename OtherReal>
+__global__
+static void _copy_from_smat(Real* mat_out, const MatrixElement<OtherReal>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  int smat_index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (smat_index >= d_in) return;
+  // Look the entry up once instead of indexing smat_in three times.
+  const MatrixElement<OtherReal> &elem = smat_in[smat_index];
+  int data_index = elem.row * d_out.stride + elem.column;
+  // Real and OtherReal may differ; convert explicitly, as
+  // _copy_from_mat_trans does.
+  mat_out[data_index] = static_cast<Real>(elem.weight);
+}
+
+// Transposing counterpart of the sparse-to-dense scatter: each entry of
+// smat_in (row, column, weight) is written to
+// mat_out[column * d_out.stride + row], i.e. with row and column swapped.
+//
+// Launch layout: 1-D grid of 1-D blocks, one thread per sparse entry.  d_in
+// is the number of entries in smat_in; threads whose flat index is >= d_in
+// do nothing.
+//
+// Positions of mat_out that no entry addresses are left untouched, so the
+// caller must clear mat_out beforehand if it wants a complete copy.  Row and
+// column values are not range-checked here.
+template<typename Real, typename OtherReal>
+__global__
+static void _copy_from_smat_trans(Real* mat_out, const MatrixElement<OtherReal>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  int smat_index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (smat_index >= d_in) return;
+  // Look the entry up once instead of indexing smat_in three times.
+  const MatrixElement<OtherReal> &elem = smat_in[smat_index];
+  int data_index = elem.column * d_out.stride + elem.row;
+  // Real and OtherReal may differ; convert explicitly, as
+  // _copy_from_mat_trans does.
+  mat_out[data_index] = static_cast<Real>(elem.weight);
+}
+
 template<typename Real>
 __global__
 static void _transpose_matrix(Real* mat, MatrixDim d) {
@@ -2907,3 +2925,27 @@ void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double
   _copy_from_mat_trans<<<Gr,Bl>>>(mat_out,mat_in,d_out,d_in);
 }
 
+// Host-side launchers for the _copy_from_smat kernel, one per
+// (destination, source) element-type pair.  The two suffix letters name the
+// type of mat_out first and of smat_in second (f = float, d = double).
+// Gr and Bl are passed through unchanged as the grid and block dimensions.
+void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat<float, float><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
+void cuda_copy_from_smat_fd(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat<float, double><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
+void cuda_copy_from_smat_df(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat<double, float><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
+void cuda_copy_from_smat_dd(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat<double, double><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
+// Host-side launchers for the _copy_from_smat_trans kernel, one per
+// (destination, source) element-type pair.  The two letters before "_trans"
+// name the type of mat_out first and of smat_in second (f = float,
+// d = double).  Gr and Bl are passed through unchanged as the grid and block
+// dimensions.
+void cuda_copy_from_smat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat_trans<float, float><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
+void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat_trans<float, double><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
+void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<float>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat_trans<double, float><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
+void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement<double>* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  _copy_from_smat_trans<double, double><<<Gr, Bl>>>(mat_out, smat_in, d_out, d_in);
+}
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
@@ -82,6 +82,32 @@ inline void cuda_copy_from_mat_trans(dim3 Gr, dim3 Bl, double* mat_out, const fl
   cuda_copy_from_mat_df_trans(Gr, Bl, mat_out, mat_in, d_out, d_in);
 }
 
+// Type-overloaded front end for the sparse-to-dense copy: picks the
+// cuda_copy_from_smat_XY routine whose suffix matches the element types of
+// mat_out (X) and smat_in (Y), and forwards every argument unchanged.
+inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, float* mat_out,
+                                const MatrixElement<float>* smat_in,
+                                MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_ff(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, float* mat_out,
+                                const MatrixElement<double>* smat_in,
+                                MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_fd(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, double* mat_out,
+                                const MatrixElement<float>* smat_in,
+                                MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_df(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+inline void cuda_copy_from_smat(dim3 Gr, dim3 Bl, double* mat_out,
+                                const MatrixElement<double>* smat_in,
+                                MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_dd(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+
+// Type-overloaded front end for the transposing sparse-to-dense copy: picks
+// the cuda_copy_from_smat_XY_trans routine whose suffix matches the element
+// types of mat_out (X) and smat_in (Y), and forwards every argument
+// unchanged.
+inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, float* mat_out,
+                                      const MatrixElement<float>* smat_in,
+                                      MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_ff_trans(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, float* mat_out,
+                                      const MatrixElement<double>* smat_in,
+                                      MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_fd_trans(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat_out,
+                                      const MatrixElement<float>* smat_in,
+                                      MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_df_trans(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat_out,
+                                      const MatrixElement<double>* smat_in,
+                                      MatrixDim d_out, MatrixIndexT_cuda d_in) {
+  cuda_copy_from_smat_dd_trans(Gr, Bl, mat_out, smat_in, d_out, d_in);
+}
+
 inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { cudaF_copy_col_from_vec(Gr,Bl,mat,v,col,d); }
 inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr,Bl,mat,d); }
 inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); }

diff --git a/src/cudamatrix/cu-matrix-lib.h b/src/cudamatrix/cu-matrix-lib.h
@@ -26,6 +26,7 @@
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-sp-matrix.h"
 #include "cudamatrix/cu-tp-matrix.h"
+#include "cudamatrix/cu-sparse-matrix.h"
 #include "cudamatrix/cu-block-matrix.h"
 #include "cudamatrix/cu-rand.h"
 

diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
@@ -39,7 +39,6 @@
 #include "cudamatrix/cu-sp-matrix.h"
 #include "cudamatrix/cu-tp-matrix.h"
 #include "cudamatrix/cu-block-matrix.h"
-#include "cudamatrix/cu-sparse-matrix.h"
 #include "cudamatrix/cublas-wrappers.h"
 
 namespace kaldi {
@@ -255,6 +254,53 @@ template
 void CuMatrixBase<double>::CopyFromMat<double>(const CuMatrixBase<double> &M,
                                                MatrixTransposeType Trans);
 
+// Copies the sparse matrix M (or its transpose, when trans != kNoTrans) into
+// *this.  The dimensions of *this must already match those of M (swapped in
+// the transposed case); this is asserted and no resizing is done.
+//
+// GPU path: the kernels launched below write only the positions that M
+// stores, one thread per stored element.  The destination is therefore
+// zeroed first so that all other entries end up as zero instead of keeping
+// stale values, and no kernel is launched when M stores no elements (a
+// zero-sized grid is an invalid launch configuration).
+//
+// CPU path: delegates to the host matrix's CopyFromSmat.
+template <typename Real>
+template <typename OtherReal>
+void CuMatrixBase<Real>::CopyFromSmat(const CuSparseMatrix<OtherReal> &M,
+                                      MatrixTransposeType trans) {
+  // Sanity check.
+  if (trans == kNoTrans) {
+    KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_);
+  } else {
+    KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_);
+  }
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    // Empty destination: nothing to clear and nothing to copy.
+    if (num_rows_ == 0) return;
+    Timer tim;
+    // Clear every entry; the scatter below only touches stored elements.
+    this->SetZero();
+    if (M.NumElements() > 0) {
+      dim3 dimBlock(CU1DBLOCK, 1);
+      dim3 dimGrid(n_blocks(M.NumElements(), CU1DBLOCK), 1);
+      if (trans == kNoTrans) {
+        cuda_copy_from_smat(dimGrid, dimBlock, this->data_,
+                            M.Data(), this->Dim(), M.NumElements());
+      } else {
+        cuda_copy_from_smat_trans(dimGrid, dimBlock, this->data_,
+                                  M.Data(), this->Dim(), M.NumElements());
+      }
+      // Kernel launches do not report errors themselves; surface a bad
+      // launch here rather than at some unrelated later call.
+      CU_SAFE_CALL(cudaGetLastError());
+    }
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().CopyFromSmat(M.Mat(), trans);
+  }
+}
+
+// Explicitly instantiate CopyFromSmat for all four (Real, OtherReal)
+// combinations of float and double.  The member template is defined in this
+// .cc file, so code in other files presumably relies on these
+// instantiations being present.
+template
+void CuMatrixBase<float>::CopyFromSmat<float>(const CuSparseMatrix<float> &M,
+                                              MatrixTransposeType trans);
+
+template
+void CuMatrixBase<float>::CopyFromSmat<double>(const CuSparseMatrix<double> &M,
+                                               MatrixTransposeType trans);
+
+template
+void CuMatrixBase<double>::CopyFromSmat<float>(const CuSparseMatrix<float> &M,
+                                               MatrixTransposeType trans);
+
+template
+void CuMatrixBase<double>::CopyFromSmat<double>(const CuSparseMatrix<double> &M,
+                                                MatrixTransposeType trans);
+
 template<typename Real>
 template<typename OtherReal>
 void CuMatrixBase<Real>::CopyFromTp(const CuTpMatrix<OtherReal> &M,
@@ -2240,9 +2286,6 @@ void CuMatrixBase<Real>::CopyFromGeneralMat(const GeneralMatrix &src,
         return;
       }
 #endif
-      Matrix<BaseFloat> mat(trans == kNoTrans ? smat.NumRows() : smat.NumCols(),
-                            trans == kNoTrans ? smat.NumCols() : smat.NumRows(),
-                            kUndefined);
       Mat().CopyFromSmat(smat, trans);
       return;
     }

diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
@@ -31,6 +31,7 @@
 #include "cudamatrix/cu-matrixdim.h"
 #include "cudamatrix/cu-common.h"
 #include "cudamatrix/cu-value.h"
+#include "cudamatrix/cu-sparse-matrix.h"
 #include "matrix/matrix-common.h"
 #include "matrix/kaldi-matrix.h"
 #include "matrix/sparse-matrix.h"