taichi-dev · qiao-bo · Jan 3, 2022 · Dec 31, 2021 · Dec 31, 2021 · Dec 31, 2021
diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py
@@ -74,35 +74,27 @@ def __getitem__(self, key):
         """
         raise NotImplementedError()
 
-    def ndarray_fill(self, val):
+    @python_scope
+    def fill(self, val):
         """Fills ndarray with a specific scalar value.
 
         Args:
             val (Union[int, float]): Value to fill.
         """
         if impl.current_cfg().ndarray_use_torch:
             self.arr.fill_(val)
-        elif impl.current_cfg().arch != _ti_core.Arch.cuda:
-            taichi.lang.meta.fill_ndarray(self, val)
+        elif impl.current_cfg(
+        ).arch != _ti_core.Arch.cuda and impl.current_cfg(
+        ).arch != _ti_core.Arch.x64:
+            self.fill_by_kernel(val)
         elif self.dtype == primitive_types.f32:
             self.arr.fill_float(val)
         elif self.dtype == primitive_types.i32:
             self.arr.fill_int(val)
         elif self.dtype == primitive_types.u32:
             self.arr.fill_uint(val)
         else:
-            taichi.lang.meta.fill_ndarray(self, val)
-
-    def ndarray_matrix_fill(self, val):
-        """Fills ndarray with a specific scalar value.
-
-        Args:
-            val (Union[int, float]): Value to fill.
-        """
-        if impl.current_cfg().ndarray_use_torch:
-            self.arr.fill_(val)
-        else:
-            taichi.lang.meta.fill_ndarray_matrix(self, val)
+            self.fill_by_kernel(val)
 
     def ndarray_to_numpy(self):
         """Converts ndarray to a numpy array.
@@ -218,6 +210,14 @@ def __deepcopy__(self, memo=None):
         """
         raise NotImplementedError()
 
+    def fill_by_kernel(self, val):
+        """Fills ndarray with a specific scalar value using a ti.kernel.
+
+        Args:
+            val (Union[int, float]): Value to fill.
+        """
+        raise NotImplementedError()
+
     def pad_key(self, key):
         if key is None:
             key = ()
@@ -263,10 +263,6 @@ def __getitem__(self, key):
         self.initialize_host_accessor()
         return self.host_accessor.getter(*self.pad_key(key))
 
-    @python_scope
-    def fill(self, val):
-        self.ndarray_fill(val)
-
     @python_scope
     def to_numpy(self):
         return self.ndarray_to_numpy()
@@ -280,6 +276,9 @@ def __deepcopy__(self, memo=None):
         ret_arr.copy_from(self)
         return ret_arr
 
+    def fill_by_kernel(self, val):
+        taichi.lang.meta.fill_ndarray(self, val)
+
     def __repr__(self):
         return '<ti.ndarray>'
 

diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
@@ -1348,10 +1348,6 @@ def __getitem__(self, key):
             [[NdarrayHostAccess(self, key, (i, j)) for j in range(self.m)]
              for i in range(self.n)])
 
-    @python_scope
-    def fill(self, val):
-        self.ndarray_matrix_fill(val)
-
     @python_scope
     def to_numpy(self):
         return self.ndarray_matrix_to_numpy(as_vector=0)
@@ -1366,6 +1362,9 @@ def __deepcopy__(self, memo=None):
         ret_arr.copy_from(self)
         return ret_arr
 
+    def fill_by_kernel(self, val):
+        taichi.lang.meta.fill_ndarray_matrix(self, val)
+
     def __repr__(self):
         return f'<{self.n}x{self.m} {self.layout} ti.Matrix.ndarray>'
 
@@ -1405,10 +1404,6 @@ def __getitem__(self, key):
         return Vector(
             [NdarrayHostAccess(self, key, (i, )) for i in range(self.n)])
 
-    @python_scope
-    def fill(self, val):
-        self.ndarray_matrix_fill(val)
-
     @python_scope
     def to_numpy(self):
         return self.ndarray_matrix_to_numpy(as_vector=1)
@@ -1422,5 +1417,8 @@ def __deepcopy__(self, memo=None):
         ret_arr.copy_from(self)
         return ret_arr
 
+    def fill_by_kernel(self, val):
+        taichi.lang.meta.fill_ndarray_matrix(self, val)
+
     def __repr__(self):
         return f'<{self.n} {self.layout} ti.Vector.ndarray>'
diff --git a/taichi/backends/cpu/cpu_device.cpp b/taichi/backends/cpu/cpu_device.cpp
@@ -5,7 +5,7 @@ namespace lang {
 
 namespace cpu {
 
-CpuDevice::AllocInfo CpuDevice::get_alloc_info(DeviceAllocation handle) {
+CpuDevice::AllocInfo CpuDevice::get_alloc_info(const DeviceAllocation handle) {
   validate_device_alloc(handle);
   return allocations_[handle.alloc_id];
 }

diff --git a/taichi/backends/cpu/cpu_device.h b/taichi/backends/cpu/cpu_device.h
@@ -83,7 +83,7 @@ class CpuDevice : public Device {
     bool use_cached{false};
   };
 
-  AllocInfo get_alloc_info(DeviceAllocation handle);
+  AllocInfo get_alloc_info(const DeviceAllocation handle);
 
   ~CpuDevice() override{};
 
@@ -116,7 +116,7 @@ class CpuDevice : public Device {
   std::unordered_map<int, std::unique_ptr<VirtualMemoryAllocator>>
       virtual_memories_;
 
-  void validate_device_alloc(DeviceAllocation alloc) {
+  void validate_device_alloc(const DeviceAllocation alloc) {
     if (allocations_.size() <= alloc.alloc_id) {
       TI_ERROR("invalid DeviceAllocation");
     }

diff --git a/taichi/backends/cuda/cuda_device.cpp b/taichi/backends/cuda/cuda_device.cpp
@@ -5,7 +5,8 @@ namespace lang {
 
 namespace cuda {
 
-CudaDevice::AllocInfo CudaDevice::get_alloc_info(DeviceAllocation handle) {
+CudaDevice::AllocInfo CudaDevice::get_alloc_info(
+    const DeviceAllocation handle) {
   validate_device_alloc(handle);
   return allocations_[handle.alloc_id];
 }

diff --git a/taichi/backends/cuda/cuda_device.h b/taichi/backends/cuda/cuda_device.h
@@ -95,7 +95,7 @@ class CudaDevice : public Device {
     bool use_cached{false};
   };
 
-  AllocInfo get_alloc_info(DeviceAllocation handle);
+  AllocInfo get_alloc_info(const DeviceAllocation handle);
 
   ~CudaDevice() override{};
 
@@ -125,7 +125,7 @@ class CudaDevice : public Device {
 
  private:
   std::vector<AllocInfo> allocations_;
-  void validate_device_alloc(DeviceAllocation alloc) {
+  void validate_device_alloc(const DeviceAllocation alloc) {
     if (allocations_.size() <= alloc.alloc_id) {
       TI_ERROR("invalid DeviceAllocation");
     }

diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp
@@ -593,7 +593,8 @@ std::shared_ptr<Device> LlvmProgramImpl::get_device_shared() {
   return device_;
 }
 
-uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(DeviceAllocation &alloc) {
+uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(
+    const DeviceAllocation &alloc) {
   if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
     return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr;
@@ -605,16 +606,18 @@ uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(DeviceAllocation &alloc) {
   }
 }
 
-void LlvmProgramImpl::fill_ndarray(DeviceAllocation &alloc,
+void LlvmProgramImpl::fill_ndarray(const DeviceAllocation &alloc,
                                    std::size_t size,
                                    uint32_t data) {
+  auto ptr = get_ndarray_alloc_info_ptr(alloc);
   if (config->arch == Arch::cuda) {
 #if defined(TI_WITH_CUDA)
-    auto ptr = get_ndarray_alloc_info_ptr(alloc);
     CUDADriver::get_instance().memsetd32((void *)ptr, data, size);
 #else
     TI_NOT_IMPLEMENTED
 #endif
+  } else {
+    std::fill((uint32_t *)ptr, (uint32_t *)ptr + size, data);
   }
 }
 }  // namespace lang

diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h
@@ -103,11 +103,13 @@ class LlvmProgramImpl : public ProgramImpl {
   DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
                                            uint64 *result_buffer) override;
 
-  uint64_t *get_ndarray_alloc_info_ptr(DeviceAllocation &alloc);
+  uint64_t *get_ndarray_alloc_info_ptr(const DeviceAllocation &alloc);
 
   std::shared_ptr<Device> get_device_shared() override;
 
-  void fill_ndarray(DeviceAllocation &alloc, std::size_t size, uint32_t data);
+  void fill_ndarray(const DeviceAllocation &alloc,
+                    std::size_t size,
+                    uint32_t data);
 
  private:
   std::unique_ptr<llvm::Module> clone_struct_compiler_initial_context(

diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
@@ -319,16 +319,22 @@ def test_ndarray_cuda_caching_allocator():
 def test_ndarray_fill():
     n = 8
     a = ti.ndarray(ti.i32, shape=(n))
+    anp = np.ones((n, ), dtype=np.int32)
     a.fill(2)
-    assert (a[4] == 2)
-
-    a = ti.ndarray(ti.f32, shape=(n))
-    a.fill(2.5)
-    assert (a[4] == 2.5)
-
-    a = ti.ndarray(ti.u32, shape=(n))
-    a.fill(0)
-    assert (a[4] == 0)
+    anp.fill(2)
+    assert (a.to_numpy() == anp).all()
+
+    b = ti.Vector.ndarray(4, ti.f32, shape=(n))
+    bnp = np.ones(shape=b.arr.shape, dtype=np.float32)
+    b.fill(2.5)
+    bnp.fill(2.5)
+    assert (b.to_numpy() == bnp).all()
+
+    c = ti.Matrix.ndarray(4, 4, ti.f32, shape=(n))
+    cnp = np.ones(shape=c.arr.shape, dtype=np.float32)
+    c.fill(1.5)
+    cnp.fill(1.5)
+    assert (c.to_numpy() == cnp).all()
 
 
 @ti.test(arch=supported_archs_taichi_ndarray)