From a0882a6ef0597697871fb23eaa84ca8d9f74040d Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 16 Feb 2022 22:44:07 +0800 Subject: [PATCH] add syncgap option, use rough syncgap mode by default --- README.md | 6 +++-- src/main.cpp | 25 ++++++++++++++----- src/realcugan.cpp | 63 +++++++++++++++++++++++++++++++++++++++++++++-- src/realcugan.h | 4 +++ 4 files changed, 88 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index e23078a..169177c 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,13 @@ ![CI](https://github.com/nihui/realcugan-ncnn-vulkan/workflows/CI/badge.svg) ![download](https://img.shields.io/github/downloads/nihui/realcugan-ncnn-vulkan/total.svg) -ncnn implementation of Real-CUGAN converter. Runs fast on Intel / AMD / Nvidia with Vulkan API. +ncnn implementation of Real-CUGAN converter. Runs fast on Intel / AMD / Nvidia / Apple-Silicon with Vulkan API. realcugan-ncnn-vulkan uses [ncnn project](https://github.com/Tencent/ncnn) as the universal neural network inference framework. ## [Download](https://github.com/nihui/realcugan-ncnn-vulkan/releases) -Download Windows/Linux/MacOS Executable for Intel/AMD/Nvidia GPU +Download Windows/Linux/MacOS Executable for Intel/AMD/Nvidia/Apple-Silicon GPU **https://github.com/nihui/realcugan-ncnn-vulkan/releases** @@ -43,6 +43,7 @@ Usage: realcugan-ncnn-vulkan -i infile -o outfile [options]... -n noise-level denoise level (-1/0/1/2/3, default=-1) -s scale upscale ratio (1/2/3/4, default=2) -t tile-size tile size (>=32/0=auto, default=0) can be 0,0,0 for multi-gpu + -c syncgap-mode sync gap mode (0/1/2, default=2) -m model-path realcugan model path (default=models-se) -g gpu-id gpu device to use (-1=cpu, default=auto) can be 0,1,2 for multi-gpu -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu @@ -54,6 +55,7 @@ Usage: realcugan-ncnn-vulkan -i infile -o outfile [options]... 
- `noise-level` = noise level, large value means strong denoise effect, -1 = no effect - `scale` = scale level, 1 = no scaling, 2 = upscale 2x - `tile-size` = tile size, use smaller value to reduce GPU memory usage, default selects automatically +- `syncgap-mode` = sync gap mode, 0 = no sync, 1 = accurate sync, 2 = rough sync - `load:proc:save` = thread count for the three stages (image decoding + realcugan upscaling + image encoding), using larger values may increase GPU usage and consume more GPU memory. You can tune this configuration with "4:4:4" for many small-size images, and "2:2:2" for large-size images. The default setting usually works fine for most situations. If you find that your GPU is hungry, try increasing thread count to achieve faster processing. - `format` = the format of the image to be output, png is better supported, however webp generally yields smaller file sizes, both are losslessly encoded diff --git a/src/main.cpp b/src/main.cpp index 2e2e1d1..8d01721 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -108,6 +108,7 @@ static void print_usage() fprintf(stdout, " -n noise-level denoise level (-1/0/1/2/3, default=-1)\n"); fprintf(stdout, " -s scale upscale ratio (1/2/3/4, default=2)\n"); fprintf(stdout, " -t tile-size tile size (>=32/0=auto, default=0) can be 0,0,0 for multi-gpu\n"); + fprintf(stdout, " -c syncgap-mode sync gap mode (0/1/2, default=2)\n"); fprintf(stdout, " -m model-path realcugan model path (default=models-se)\n"); fprintf(stdout, " -g gpu-id gpu device to use (-1=cpu, default=auto) can be 0,1,2 for multi-gpu\n"); fprintf(stdout, " -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n"); @@ -447,13 +448,14 @@ int main(int argc, char** argv) std::vector jobs_proc; int jobs_save = 2; int verbose = 0; + int syncgap = 2; int tta_mode = 0; path_t format = PATHSTR("png"); #if _WIN32 setlocale(LC_ALL, ""); wchar_t opt; - while ((opt = getopt(argc, argv, L"i:o:n:s:t:m:g:j:f:vxh")) != 
(wchar_t)-1) + while ((opt = getopt(argc, argv, L"i:o:n:s:t:c:m:g:j:f:vxh")) != (wchar_t)-1) { switch (opt) { @@ -472,6 +474,9 @@ int main(int argc, char** argv) case L't': tilesize = parse_optarg_int_array(optarg); break; + case L'c': + syncgap = _wtoi(optarg); + break; case L'm': model = optarg; break; @@ -499,7 +504,7 @@ int main(int argc, char** argv) } #else // _WIN32 int opt; - while ((opt = getopt(argc, argv, "i:o:n:s:t:m:g:j:f:vxh")) != -1) + while ((opt = getopt(argc, argv, "i:o:n:s:t:c:m:g:j:f:vxh")) != -1) { switch (opt) { @@ -518,6 +523,9 @@ int main(int argc, char** argv) case 't': tilesize = parse_optarg_int_array(optarg); break; + case 'c': + syncgap = atoi(optarg); + break; case 'm': model = optarg; break; @@ -569,6 +577,12 @@ int main(int argc, char** argv) return -1; } + if (!(syncgap == 0 || syncgap == 1 || syncgap == 2)) + { + fprintf(stderr, "invalid syncgap argument\n"); + return -1; + } + for (int i=0; i<(int)tilesize.size(); i++) { if (tilesize[i] != 0 && tilesize[i] < 32) @@ -708,11 +722,10 @@ int main(int argc, char** argv) return -1; } - int syncgap = 0; - - if (model.find(PATHSTR("models-se")) != path_t::npos) + if (model.find(PATHSTR("models-nose")) != path_t::npos) { - syncgap = 1; + // force syncgap off for nose models + syncgap = 0; } #if _WIN32 diff --git a/src/realcugan.cpp b/src/realcugan.cpp index 95aa512..32c245a 100644 --- a/src/realcugan.cpp +++ b/src/realcugan.cpp @@ -247,7 +247,12 @@ int RealCUGAN::process(const ncnn::Mat& inimage, ncnn::Mat& outimage) const { // cpu only if (syncgap_needed && syncgap) - return process_cpu_se(inimage, outimage); + { + if (syncgap == 1) + return process_cpu_se(inimage, outimage); + if (syncgap == 2) + return process_cpu_se_rough(inimage, outimage); + } else return process_cpu(inimage, outimage); } @@ -259,7 +264,12 @@ int RealCUGAN::process(const ncnn::Mat& inimage, ncnn::Mat& outimage) const } if (syncgap_needed && syncgap) - return process_se(inimage, outimage); + { + if (syncgap == 1) + 
return process_se(inimage, outimage);
+        if (syncgap == 2) // mode 2 is dispatched to the "rough" variant defined further down
+            return process_se_rough(inimage, outimage);
+    }
 
     const unsigned char* pixeldata = (const unsigned char*)inimage.data;
     const int w = inimage.w;
@@ -1141,6 +1151,36 @@ int RealCUGAN::process_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
     return 0;
 }
 
+int RealCUGAN::process_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const // rough sync-gap path using vkdev allocators: one stage0 call, one sync-gap call, one stage2 call
+{
+    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); // borrowed from vkdev; handed back with reclaim_blob_allocator() before returning
+    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); // borrowed from vkdev; handed back with reclaim_staging_allocator() before returning
+
+    ncnn::Option opt = net.opt; // local copy of net.opt; the allocator fields below are overridden on the copy only
+    opt.blob_vkallocator = blob_vkallocator;
+    opt.workspace_vkallocator = blob_vkallocator; // workspace uses the same allocator as blobs
+    opt.staging_vkallocator = staging_vkallocator;
+
+    FeatureCache cache; // passed to all three calls below and cleared before returning
+
+    std::vector in0 = {}; // empty "names" argument for stage0
+    std::vector out0 = {"gap0", "gap1", "gap2", "gap3"}; // "outnames" argument for stage0
+    process_se_stage0(inimage, in0, out0, opt, cache);
+
+    std::vector gap0 = {"gap0", "gap1", "gap2", "gap3"}; // the same four names are handed to the sync-gap step
+    process_se_sync_gap(inimage, gap0, opt, cache);
+
+    std::vector in4 = {"gap0", "gap1", "gap2", "gap3"}; // the same four names are handed to stage2 as its "names" argument
+    process_se_stage2(inimage, in4, outimage, opt, cache); // outimage is passed through to stage2
+
+    cache.clear();
+
+    vkdev->reclaim_blob_allocator(blob_vkallocator);
+    vkdev->reclaim_staging_allocator(staging_vkallocator);
+
+    return 0; // NOTE(review): results of the three calls above are discarded, so this always returns 0 - confirm that is intended
+}
+
 int RealCUGAN::process_cpu_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) const
 {
     FeatureCache cache;
@@ -1181,6 +1221,25 @@ int RealCUGAN::process_cpu_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) con
     return 0;
 }
 
+int RealCUGAN::process_cpu_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const // same stage0 -> sync gap -> stage2 sequence as process_se_rough, via the process_cpu_se_* helpers and without an ncnn::Option or vkdev allocators
+{
+    FeatureCache cache; // passed to all three calls below and cleared before returning
+
+    std::vector in0 = {}; // empty list passed as stage0's second argument
+    std::vector out0 = {"gap0", "gap1", "gap2", "gap3"}; // passed as stage0's third argument
+    process_cpu_se_stage0(inimage, in0, out0, cache);
+
+    std::vector gap0 = {"gap0", "gap1", "gap2", "gap3"}; // the same four names are handed to the sync-gap step
+    process_cpu_se_sync_gap(inimage, gap0, cache);
+
+    std::vector in4 = {"gap0", "gap1", "gap2", "gap3"}; // the same four names are handed to stage2
+    process_cpu_se_stage2(inimage, in4, outimage, cache); // outimage is passed through to stage2
+
+    cache.clear();
+
+    return 0; // NOTE(review): results of the three calls above are discarded, so this always returns 0 - confirm that is intended
+}
+
 int
RealCUGAN::process_se_stage0(const ncnn::Mat& inimage, const std::vector& names, const std::vector& outnames, const ncnn::Option& opt, FeatureCache& cache) const { const unsigned char* pixeldata = (const unsigned char*)inimage.data; diff --git a/src/realcugan.h b/src/realcugan.h index c93cf50..b8a4b8c 100644 --- a/src/realcugan.h +++ b/src/realcugan.h @@ -31,6 +31,10 @@ class RealCUGAN int process_cpu_se(const ncnn::Mat& inimage, ncnn::Mat& outimage) const; + int process_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const; + + int process_cpu_se_rough(const ncnn::Mat& inimage, ncnn::Mat& outimage) const; + protected: int process_se_stage0(const ncnn::Mat& inimage, const std::vector& names, const std::vector& outnames, const ncnn::Option& opt, FeatureCache& cache) const; int process_se_stage2(const ncnn::Mat& inimage, const std::vector& names, ncnn::Mat& outimage, const ncnn::Option& opt, FeatureCache& cache) const;