parallel101 · Sduby22 · Apr 26, 2022 · Apr 26, 2022 · Apr 26, 2022 · Apr 26, 2022
diff --git a/ANSWER.md b/ANSWER.md
@@ -1,23 +1,45 @@
 # 改进前
 
 ```
-这里贴改进前的运行结果。
-matrix_randomize: 100s
+...
+t=2: n=1024
+matrix_randomize: 0.00138448s
+matrix_randomize: 0.00140546s
+matrix_transpose: 0.00400085s
+matrix_multiply: 1.32128s
+matrix_multiply: 1.37719s
+matrix_RtAR: 2.7027s
+matrix_trace: 6.573e-06s
+1.34324e+08
+test_func: 2.70884s
+overall: 11.703s
+...
 ```
 
 # 改进后
 
 ```
-这里贴改进后的运行结果。
-matrix_randomize: 0.01s
+t=2: n=1024
+matrix_randomize: 0.000223787s
+matrix_randomize: 0.000234784s
+matrix_transpose: 0.000950278s
+matrix_multiply: 0.0211322s
+matrix_multiply: 0.0236634s
+matrix_RtAR: 0.045924s
+matrix_trace: 6.171e-06s
+1.34277e+08
+test_func: 0.0502626s
+...
+overall: 0.228116s
+
 ```
 
 # 加速比
 
-matrix_randomize: 10000x
-matrix_transpose: 10000x
-matrix_multiply: 10000x
-matrix_RtAR: 10000x
+matrix_randomize: 6.18x
+matrix_transpose: 4.21x
+matrix_multiply: 62.52x
+matrix_RtAR: 58.85x
 
 > 如果记录了多种优化方法，可以做表格比较
 
@@ -27,19 +49,29 @@ matrix_RtAR: 10000x
 
 > matrix_randomize
 
-请回答。
+- 在原函数中，yx序的矩阵使用xy序遍历。交换xy遍历顺序即可优化为顺序访问。(O3优化似乎自动帮你做了)
+- 可以使用`_mm_stream_si32`绕过缓存写入。
+- 也可以使用`_mm_stream_ps`绕过缓存写入，但这要求首先计算4次`random`，有可能变成CPU-bound. 或者可以设计一个每次输出一个128比特向量的random.
+  - 如果数组大小不是4的倍数，边界需要特殊处理，或者申请数组时向外扩张4个float以免越界。
 
 > matrix_transpose
 
-请回答。
+- 可以使用简单的分块循环，每次转置一个直径为64的块
+  - 需要注意越界问题，需要边缘扩展64字节
+- (未使用)可以使用莫顿码遍历，但由于注意到矩阵直径不是2的整数幂所以可以使用tbb的`simple_partitioner`自带的莫顿码遍历
+- 使用`_mm_stream_si32`绕过缓存写入
 
 > matrix_multiply
 
-请回答。
+- 需要手动初始化，因为无法保证Matrix全为0，因为它是作为参数传入而不是作为临时变量构造的。
+  - 但是可以使用memset进行优化
+- 首先可以交换t和x的遍历顺序，因为赋值语句是`out(x, y) += lhs(x, t) * rhs(t, y);`让t和y不要动，否则就是跳着遍历效率低。
+- 经测试，在我的电脑上如果进行分块会导致更慢...
 
 > matrix_RtAR
 
-请回答。
+- 这两个是临时变量，有什么可以优化的？ 5 分
+- 使用`static`关键字简单池化避免重复分配销毁。
 
 # 我的创新点
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,8 +11,8 @@ add_executable(main main.cpp)
 find_package(OpenMP REQUIRED)
 target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
 
-#find_package(TBB REQUIRED)
-#target_link_libraries(main PUBLIC TBB::tbb)
+find_package(TBB REQUIRED)
+target_link_libraries(main PUBLIC TBB::tbb)
 
 if (MSVC)
     target_compile_options(main PUBLIC /fp:fast /arch:AVX)

diff --git a/main.cpp b/main.cpp
@@ -7,15 +7,17 @@
 // 作业中有很多个问句，请通过注释回答问题，并改进其代码，以使其更快
 // 并行可以用 OpenMP 也可以用 TBB
 
+#include <cstring>
 #include <iostream>
-//#include <x86intrin.h>  // _mm 系列指令都来自这个头文件
+#include <x86intrin.h>  // _mm 系列指令都来自这个头文件
 //#include <xmmintrin.h>  // 如果上面那个不行，试试这个
+#include "alignalloc.h"
 #include "ndarray.h"
 #include "wangsrng.h"
 #include "ticktock.h"
 
 // Matrix 是 YX 序的二维浮点数组：mat(x, y) = mat.data()[y * mat.shape(0) + x]
-using Matrix = ndarray<2, float>;
+using Matrix = ndarray<2, float, 64>;
 // 注意：默认对齐到 64 字节，如需 4096 字节，请用 ndarray<2, float, AlignedAllocator<4096, float>>
 
 static void matrix_randomize(Matrix &out) {
@@ -24,11 +26,22 @@ static void matrix_randomize(Matrix &out) {
     size_t ny = out.shape(1);
 
     // 这个循环为什么不够高效？如何优化？ 10 分
+// 不是顺序写入所以低效
+// - 在原函数中，yx序的矩阵使用xy序遍历。交换xy遍历顺序即可优化为顺序访问。(O3优化似乎自动帮你做了)
+// - 可以使用`_mm_stream_si32`绕过缓存写入。
+// - 也可以使用`_mm_stream_ps`绕过缓存写入，但这要求首先计算4次`random`，有可能变成CPU-bound. 或者可以设计一个每次输出一个128比特向量的random.
+//   - 如果数组大小不是4的倍数，边界需要特殊处理，或者申请数组时向外扩张4个float以免越界。
 #pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            float val = wangsrng(x, y).next_float();
-            out(x, y) = val;
+    for (int y = 0; y < ny; y++) {
+        for (int x = 0; x < nx; x+=4) {
+            float val[4];
+            for (int i = 0; i != 4; i++)
+                val[i] = wangsrng(x, y).next_float();
+            _mm_stream_ps(&out(x,y), _mm_load_ps(val));
+        // for (int x = 0; x < nx; x++) {
+        //     float val = wangsrng(x, y).next_float();
+        //     _mm_stream_si32((int*)&out(x, y), *(int*)&val);
+        //     out(x, y) = val;
         }
     }
     TOCK(matrix_randomize);
@@ -41,10 +54,19 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
     out.reshape(ny, nx);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
+// 主要的问题out和in总有一个不是顺序访问所以造成大量cache miss. 可以通过分块遍历优化。
+// - 可以使用简单的分块循环，每次转置一个直径为64的块
+//   - 需要注意越界问题，需要边缘扩展64字节
+// - (未使用)可以使用莫顿码遍历，但由于注意到矩阵直径不是2的整数幂所以可以使用tbb的`simple_partitioner`自带的莫顿码遍历
+// - 使用`_mm_stream_si32`绕过缓存写入
 #pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            out(y, x) = in(x, y);
+    for (int yBase = 0; yBase < ny; yBase+=64) {
+        for (int xBase = 0; xBase < nx; xBase+=64) {
+            for (int y = yBase; y != yBase + 64; y++) {
+                for (int x = xBase; x != xBase + 64; x++) {
+                    _mm_stream_si32((int*)&out(x, y), *(int*)&in(y, x));
+                }
+            }
         }
     }
     TOCK(matrix_transpose);
@@ -62,11 +84,13 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
     out.reshape(nx, ny);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
-#pragma omp parallel for collapse(2)
+    // 由于遍历顺序不佳所以不是顺序访问，可以通过改变t和x的遍历顺序解决
+#pragma omp parallel for 
     for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            out(x, y) = 0;  // 有没有必要手动初始化？ 5 分
-            for (int t = 0; t < nt; t++) {
+        // 需要手动初始化，因为无法保证Matrix全为0，因为它是作为参数传入而不是作为临时变量构造的。
+        memset(&out(0, y), 0, sizeof(float) * (nx));
+        for (int t = 0; t < nt; t++) {
+            for (int x = 0; x < nx; x++) {
                 out(x, y) += lhs(x, t) * rhs(t, y);
             }
         }
@@ -78,7 +102,8 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
 static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
     TICK(matrix_RtAR);
     // 这两个是临时变量，有什么可以优化的？ 5 分
-    Matrix Rt, RtA;
+    // 简单池化避免重复分配销毁。
+    static Matrix Rt, RtA;
     matrix_transpose(Rt, R);
     matrix_multiply(RtA, Rt, A);
     matrix_multiply(RtAR, RtA, R);