From a0882a6ef0597697871fb23eaa84ca8d9f74040d Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Wed, 16 Feb 2022 22:44:07 +0800
Subject: [PATCH] add syncgap option, use rough syncgap mode by default

---
 README.md         |  6 +++--
 src/main.cpp      | 25 ++++++++++++++-----
 src/realcugan.cpp | 63 +++++++++++++++++++++++++++++++++++++++++++++--
 src/realcugan.h   |  4 +++
 4 files changed, 88 insertions(+), 10 deletions(-)
diff --git a/README.md b/README.md
index e23078a..169177c 100644
--- a/README.md
+++ b/README.md
@@ -5,13 +5,13 @@
 ![CI](https://github.com/nihui/realcugan-ncnn-vulkan/workflows/CI/badge.svg)
 ![download](https://img.shields.io/github/downloads/nihui/realcugan-ncnn-vulkan/total.svg)
 
-ncnn implementation of Real-CUGAN converter. Runs fast on Intel / AMD / Nvidia with Vulkan API.
+ncnn implementation of Real-CUGAN converter. Runs fast on Intel / AMD / Nvidia / Apple-Silicon with Vulkan API.
 
 realcugan-ncnn-vulkan uses [ncnn project](https://github.com/Tencent/ncnn) as the universal neural network inference framework.
 
 ## [Download](https://github.com/nihui/realcugan-ncnn-vulkan/releases)
 
-Download Windows/Linux/MacOS Executable for Intel/AMD/Nvidia GPU
+Download Windows/Linux/MacOS Executable for Intel/AMD/Nvidia/Apple-Silicon GPU
 
 **https://github.com/nihui/realcugan-ncnn-vulkan/releases**
 
@@ -43,6 +43,7 @@ Usage: realcugan-ncnn-vulkan -i infile -o outfile [options]...
   -n noise-level       denoise level (-1/0/1/2/3, default=-1)
   -s scale             upscale ratio (1/2/3/4, default=2)
   -t tile-size         tile size (>=32/0=auto, default=0) can be 0,0,0 for multi-gpu
+  -c syncgap-mode      sync gap mode (0/1/2, default=2)
   -m model-path        realcugan model path (default=models-se)
   -g gpu-id            gpu device to use (-1=cpu, default=auto) can be 0,1,2 for multi-gpu
   -j load:proc:save    thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu
@@ -54,6 +55,7 @@ Usage: realcugan-ncnn-vulkan -i infile -o outfile [options]...
 - `noise-level` = noise level, large value means strong denoise effect, -1 = no effect
 - `scale` = scale level, 1 = no scaling, 2 = upscale 2x
 - `tile-size` = tile size, use smaller value to reduce GPU memory usage, default selects automatically
+- `syncgap-mode` = sync gap mode, 0 = no sync, 1 = accurate sync, 2 = rough sync (default); forced to 0 when the model path contains `models-nose`
 - `load:proc:save` = thread count for the three stages (image decoding + realcugan upscaling + image encoding), using larger values may increase GPU usage and consume more GPU memory. You can tune this configuration with "4:4:4" for many small-size images, and "2:2:2" for large-size images. The default setting usually works fine for most situations. If you find that your GPU is hungry, try increasing thread count to achieve faster processing.
 - `format` = the format of the image to be output, png is better supported, however webp generally yields smaller file sizes, both are losslessly encoded
 
diff --git a/src/main.cpp b/src/main.cpp
index 2e2e1d1..8d01721 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -108,6 +108,7 @@ static void print_usage()
     fprintf(stdout, "  -n noise-level       denoise level (-1/0/1/2/3, default=-1)\n");
     fprintf(stdout, "  -s scale             upscale ratio (1/2/3/4, default=2)\n");
     fprintf(stdout, "  -t tile-size         tile size (>=32/0=auto, default=0) can be 0,0,0 for multi-gpu\n");
+    fprintf(stdout, "  -c syncgap-mode      sync gap mode (0/1/2, default=2)\n");
     fprintf(stdout, "  -m model-path        realcugan model path (default=models-se)\n");
     fprintf(stdout, "  -g gpu-id            gpu device to use (-1=cpu, default=auto) can be 0,1,2 for multi-gpu\n");
     fprintf(stdout, "  -j load:proc:save    thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n");
@@ -447,13 +448,14 @@ int main(int argc, char** argv)
     std::vector<int> jobs_proc;
     int jobs_save = 2;
     int verbose = 0;
+    int syncgap = 2;
     int tta_mode = 0;
     path_t format = PATHSTR("png");
 
 #if _WIN32
     setlocale(LC_ALL, "");
     wchar_t opt;
-    while ((opt = getopt(argc, argv, L"i:o:n:s:t:m:g:j:f:vxh")) != (wchar_t)-1)
+    while ((opt = getopt(argc, argv, L"i:o:n:s:t:c:m:g:j:f:vxh")) != (wchar_t)-1)
     {
         switch (opt)
         {
@@ -472,6 +474,9 @@ int main(int argc, char** argv)
         case L't':
             tilesize = parse_optarg_int_array(optarg);
             break;
+        case L'c':
+            syncgap = _wtoi(optarg);
+            break;
         case L'm':
             model = optarg;
             break;
@@ -499,7 +504,7 @@ int main(int argc, char** argv)
     }
 #else // _WIN32
     int opt;
-    while ((opt = getopt(argc, argv, "i:o:n:s:t:m:g:j:f:vxh")) != -1)
+    while ((opt = getopt(argc, argv, "i:o:n:s:t:c:m:g:j:f:vxh")) != -1)
     {
         switch (opt)
         {
@@ -518,6 +523,9 @@ int main(int argc, char** argv)
         case 't':
             tilesize = parse_optarg_int_array(optarg);
             break;
+        case 'c':
+            syncgap = atoi(optarg);
+            break;
         case 'm':
             model = optarg;
             break;
@@ -569,6 +577,12 @@ int main(int argc, char** argv)
         return -1;
     }
 
+    if (!(syncgap == 0 || syncgap == 1 || syncgap == 2))
+    {
+        fprintf(stderr, "invalid syncgap argument\n");
+        return -1;
+    }
+
     for (int i=0; i<(int)tilesize.size(); i++)
     {
         if (tilesize[i] != 0 && tilesize[i] < 32)
@@ -708,11 +722,10 @@ int main(int argc, char** argv)
         return -1;
     }
 
-    int syncgap = 0;
-
-    if (model.find(PATHSTR("models-se")) != path_t::npos)
+    if (model.find(PATHSTR("models-nose")) != path_t::npos)
     {
-        syncgap = 1;
+        // force syncgap off for nose models
+        syncgap = 0;
     }
 
 #if _WIN32
diff --git a/src/realcugan.cpp b/src/realcugan.cpp
index 95aa512..32c245a 100644
--- a/src/realcugan.cpp
+++ b/src/realcugan.cpp
@@ -247,7 +247,12 @@ int RealCUGAN::process(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
     {
         // cpu only
         if (syncgap_needed && syncgap)
-            return process_cpu_se(inimage, outimage);
+        {
+            if (syncgap == 1)
+                return process_cpu_se(inimage, outimage);
+            if (syncgap == 2)
+                return process_cpu_se_rough(inimage, outimage);
+        }
         else
             return process_cpu(inimage, outimage);
     }
@@ -259,7 +264,12 @@ int RealCUGAN::process(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
     }
 
     if (syncgap_needed && syncgap)
-        return process_se(inimage, outimage);
+    {
+        if (syncgap == 1)
+            return process_se(inimage, outimage);
+        if (syncgap == 2)
+            return process_se_rough(inimage, outimage);
+    }
 
     const unsigned char* pixeldata = (const unsigned char*)inimage.data;
     const int w = inimage.w;
@@ -1141,6 +1151,36 @@ int RealCUGAN::process_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
     return 0;
 }
 
+int RealCUGAN::process_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
+{
+    // rough syncgap, vulkan path: stage0 -> one sync over gap0..gap3 -> stage2
+    ncnn::VkAllocator* blob_allocator = vkdev->acquire_blob_allocator();
+    ncnn::VkAllocator* staging_allocator = vkdev->acquire_staging_allocator();
+
+    // per-call copy of the net options, pointed at the acquired allocators
+    ncnn::Option opt = net.opt;
+    opt.blob_vkallocator = blob_allocator;
+    opt.workspace_vkallocator = blob_allocator;
+    opt.staging_vkallocator = staging_allocator;
+
+    // stage0 gets an empty input-name list; every stage shares the gap names
+    std::vector<std::string> no_inputs;
+    std::vector<std::string> gap_names = {"gap0", "gap1", "gap2", "gap3"};
+
+    FeatureCache cache;
+
+    process_se_stage0(inimage, no_inputs, gap_names, opt, cache);
+    process_se_sync_gap(inimage, gap_names, opt, cache);
+    process_se_stage2(inimage, gap_names, outimage, opt, cache);
+    cache.clear();
+
+    // hand the allocators back to the device
+    vkdev->reclaim_blob_allocator(blob_allocator);
+    vkdev->reclaim_staging_allocator(staging_allocator);
+
+    return 0;
+}
+
 int RealCUGAN::process_cpu_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
 {
     FeatureCache cache;
@@ -1181,6 +1221,25 @@ int RealCUGAN::process_cpu_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) con
     return 0;
 }
 
+int RealCUGAN::process_cpu_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
+{
+    // rough syncgap, cpu path:
+    // stage0 -> a single sync over gap0..gap3 -> stage2
+
+    // stage0 is given an empty input-name list
+    std::vector<std::string> no_inputs;
+    // the same four gap names are passed to every stage
+    std::vector<std::string> gap_names = {"gap0", "gap1", "gap2", "gap3"};
+
+    FeatureCache cache;
+
+    process_cpu_se_stage0(inimage, no_inputs, gap_names, cache);
+    process_cpu_se_sync_gap(inimage, gap_names, cache);
+    process_cpu_se_stage2(inimage, gap_names, outimage, cache);
+    cache.clear();
+    return 0;
+}
+
 int RealCUGAN::process_se_stage0(const ncnn::Mat& inimage, const std::vector<std::string>& names, const std::vector<std::string>& outnames, const ncnn::Option& opt, FeatureCache& cache) const
 {
     const unsigned char* pixeldata = (const unsigned char*)inimage.data;
diff --git a/src/realcugan.h b/src/realcugan.h
index c93cf50..b8a4b8c 100644
--- a/src/realcugan.h
+++ b/src/realcugan.h
@@ -31,6 +31,10 @@ class RealCUGAN
 
     int process_cpu_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) const;
 
+    int process_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const; // vulkan path, rough syncgap: stage0, one sync over gap0..gap3, stage2
+
+    int process_cpu_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const; // cpu path, rough syncgap: stage0, one sync over gap0..gap3, stage2
+
 protected:
     int process_se_stage0(const ncnn::Mat& inimage, const std::vector<std::string>& names, const std::vector<std::string>& outnames, const ncnn::Option& opt, FeatureCache& cache) const;
     int process_se_stage2(const ncnn::Mat& inimage, const std::vector<std::string>& names, ncnn::Mat& outimage, const ncnn::Option& opt, FeatureCache& cache) const;