Merge pull request #11 from creativeIKEP/develop
v1.2.0
creativeIKEP authored Feb 15, 2022
2 parents 88f45dd + 426c279 commit be6db72
Showing 20 changed files with 268 additions and 295 deletions.
3 changes: 0 additions & 3 deletions Assets/Images/Dance.mp4

This file was deleted.

3 changes: 0 additions & 3 deletions Assets/Images/Fitness.mp4

This file was deleted.

3 changes: 3 additions & 0 deletions Assets/Images/SignLanguage.mp4
Git LFS file not shown


3 changes: 3 additions & 0 deletions Assets/Images/jump.mp4
Git LFS file not shown


2 changes: 1 addition & 1 deletion Assets/Scenes/2DSampleScene.unity
@@ -178,7 +178,7 @@ VideoPlayer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 814657130}
m_Enabled: 1
m_VideoClip: {fileID: 32900000, guid: 6f19fa9b6a47cd643bcd9dfde6e6e2c5, type: 3}
m_VideoClip: {fileID: 32900000, guid: 750a2e5ba4c61744eafb92fd15b2b8b8, type: 3}
m_TargetCameraAlpha: 1
m_TargetCamera3DLayout: 0
m_TargetCamera: {fileID: 0}
2 changes: 1 addition & 1 deletion Assets/Scenes/3DSampleScene.unity
@@ -336,7 +336,7 @@ VideoPlayer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 636196907}
m_Enabled: 1
m_VideoClip: {fileID: 32900000, guid: 331e98e799a96d845bfe6c1a73a92c37, type: 3}
m_VideoClip: {fileID: 32900000, guid: 2258e888fb73969408bc31cee3e518d3, type: 3}
m_TargetCameraAlpha: 1
m_TargetCamera3DLayout: 0
m_TargetCamera: {fileID: 0}
31 changes: 25 additions & 6 deletions Assets/Script/PoseVisuallizer.cs
@@ -8,9 +8,7 @@ public class PoseVisuallizer : MonoBehaviour
[SerializeField] WebCamInput webCamInput;
[SerializeField] RawImage inputImageUI;
[SerializeField] Shader shader;
[SerializeField] BlazePoseResource blazePoseResource;
[SerializeField, Range(0, 1)] float humanExistThreshold = 0.5f;
[SerializeField] BlazePoseModel poseLandmarkModel;

Material material;
BlazePoseDetecter detecter;
@@ -32,22 +30,43 @@ public class PoseVisuallizer : MonoBehaviour

void Start(){
material = new Material(shader);
detecter = new BlazePoseDetecter(blazePoseResource, poseLandmarkModel);
detecter = new BlazePoseDetecter();
}

void LateUpdate(){
inputImageUI.texture = webCamInput.inputImageTexture;

// Predict the pose with the neural network model.
// The model can be switched at any time via the 2nd argument.
detecter.ProcessImage(webCamInput.inputImageTexture, poseLandmarkModel);
detecter.ProcessImage(webCamInput.inputImageTexture);

// Output the landmark values (33 values) and the score of whether a human pose is visible (1 value).
for(int i = 0; i < detecter.vertexCount + 1; i++){
/*
Indices 0~32 are pose landmarks.
See the MediaPipe document below for the relation between index and landmark position.
https://google.github.io/mediapipe/solutions/pose#pose-landmark-model-blazepose-ghum-3d
Each element's components are:
x: x coordinate of the pose landmark ([0, 1]).
y: y coordinate of the pose landmark ([0, 1]).
z: Landmark depth, with the depth at the hip midpoint as the origin.
The smaller the value, the closer the landmark is to the camera ([0, 1]).
This value is available in full-body mode only.
**Use of this value is not recommended because it is still in development.**
w: Score of whether the landmark position is visible ([0, 1]).
Index 33 is the score of whether a human pose is visible ([0, 1]).
This element is (score, 0, 0, 0).
*/
Debug.LogFormat("{0}: {1}", i, detecter.GetPoseLandmark(i));
}
Debug.Log("---");
}

void OnRenderObject(){
var w = inputImageUI.rectTransform.rect.width;
var h = inputImageUI.rectTransform.rect.height;

// Set predicted pose landmark results.
// Use the predicted pose landmark results held in ComputeBuffer (GPU) memory.
material.SetBuffer("_vertices", detecter.outputBuffer);
// Set pose landmark counts.
material.SetInt("_keypointCount", detecter.vertexCount);
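
The net effect of this file's changes is that callers no longer manage resources or model selection. A minimal usage sketch under those v1.2.0 semantics (the namespace import is an assumption; `GetPoseLandmark` is assumed to return a `Vector4`, matching the x/y/z/w description in the comment above):

using UnityEngine;
using Mediapipe.BlazePose; // package namespace (assumed)

public class MinimalPoseReader : MonoBehaviour
{
    [SerializeField] Texture sourceTexture; // any Texture: webcam, video or image

    BlazePoseDetecter detecter;

    void Start(){
        // v1.2.0: BlazePoseResource is loaded automatically; no constructor arguments.
        detecter = new BlazePoseDetecter();
    }

    void LateUpdate(){
        // v1.2.0: the BlazePoseModel argument is optional; a Texture is enough.
        detecter.ProcessImage(sourceTexture);

        // CPU-side access via the new GetPoseLandmark method.
        // Index 0 is the nose in the MediaPipe topology; w is the visibility score.
        Vector4 nose = detecter.GetPoseLandmark(0);
        if(nose.w > 0.5f) Debug.Log($"Nose at ({nose.x:F2}, {nose.y:F2})");
    }
}
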
26 changes: 20 additions & 6 deletions Assets/Script/PoseVisuallizer3D.cs
@@ -9,9 +9,7 @@ public class PoseVisuallizer3D : MonoBehaviour
[SerializeField] WebCamInput webCamInput;
[SerializeField] RawImage inputImageUI;
[SerializeField] Shader shader;
[SerializeField] BlazePoseResource blazePoseResource;
[SerializeField, Range(0, 1)] float humanExistThreshold = 0.5f;
[SerializeField] BlazePoseModel poseLandmarkModel;

Material material;
BlazePoseDetecter detecter;
@@ -33,7 +31,7 @@ public class PoseVisuallizer3D : MonoBehaviour

void Start(){
material = new Material(shader);
detecter = new BlazePoseDetecter(blazePoseResource, poseLandmarkModel);
detecter = new BlazePoseDetecter();
}

void Update(){
@@ -44,12 +42,28 @@ void LateUpdate(){
inputImageUI.texture = webCamInput.inputImageTexture;

// Predict the pose with the neural network model.
// The model can be switched at any time via the 2nd argument.
detecter.ProcessImage(webCamInput.inputImageTexture, poseLandmarkModel);
detecter.ProcessImage(webCamInput.inputImageTexture);

// Output the landmark values (33 values) and the score of whether a human pose is visible (1 value).
for(int i = 0; i < detecter.vertexCount + 1; i++){
/*
Indices 0~32 are pose world landmarks.
See the MediaPipe document below for the relation between index and landmark position.
https://google.github.io/mediapipe/solutions/pose#pose-landmark-model-blazepose-ghum-3d
Each element's components are:
x, y and z: Real-world 3D coordinates in meters, with the origin at the midpoint between the hips.
w: Score of whether the world landmark position is visible ([0, 1]).
Index 33 is the score of whether a human pose is visible ([0, 1]).
This element is (score, 0, 0, 0).
*/
Debug.LogFormat("{0}: {1}", i, detecter.GetPoseWorldLandmark(i));
}
Debug.Log("---");
}

void OnRenderObject(){
// Set predicted pose world landmark results.
// Use the predicted pose world landmark results held in ComputeBuffer (GPU) memory.
material.SetBuffer("_worldVertices", detecter.worldLandmarkBuffer);
// Set pose landmark counts.
material.SetInt("_keypointCount", detecter.vertexCount);
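
Because world landmarks are metric, the new CPU accessor enables simple measurements without touching the GPU buffers. A hypothetical helper, reusing a `detecter` set up as in this script (wrist indices 15/16 follow the MediaPipe topology linked above):

// Hypothetical helper; assumes a constructed and per-frame-updated `detecter`
// as in the script above.
void LogWristSpan(BlazePoseDetecter detecter){
    Vector4 l = detecter.GetPoseWorldLandmark(15); // left wrist
    Vector4 r = detecter.GetPoseWorldLandmark(16); // right wrist
    if(l.w > 0.5f && r.w > 0.5f){
        // World landmarks are in meters with the origin at the hip midpoint,
        // so distances between landmarks are directly metric.
        float span = Vector3.Distance((Vector3)l, (Vector3)r);
        Debug.Log($"Wrist-to-wrist distance: {span:F2} m");
    }
}
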
7 changes: 7 additions & 0 deletions Packages/BlazePoseBarracuda/CHANGELOG.md
@@ -1,3 +1,10 @@
### [1.2.0] - 2022-02-15
- Improved the stability of pose estimation.
- Fixed an issue where pose estimation was not performed correctly when the color space was Linear.
- Added new methods (`GetPoseLandmark` and `GetPoseWorldLandmark`) for accessing data from the CPU (C#).
- `BlazePoseResource` asset data is now loaded automatically; the constructor arguments are no longer required.
- The `BlazePoseModel` argument is now optional. The only required argument for inference is the `Texture` data.

### [1.1.1] - 2021-10-04
Changed the install method from GitHub URL to [npmjs.com](https://www.npmjs.com/).
No source code changes.
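
In code, the API simplification described above amounts to the following; the "before" calls are taken from the sample-script diffs in this commit, while the explicit-model overload at the end is an assumption based on the `BlazePoseModel` argument being optional:

// Before (v1.1.x): resource asset and model type had to be supplied.
detecter = new BlazePoseDetecter(blazePoseResource, poseLandmarkModel);
detecter.ProcessImage(webCamInput.inputImageTexture, poseLandmarkModel);

// After (v1.2.0): resources auto-load and the model argument is optional.
detecter = new BlazePoseDetecter();
detecter.ProcessImage(webCamInput.inputImageTexture);

// Presumably the model can still be selected explicitly when needed:
detecter.ProcessImage(webCamInput.inputImageTexture, BlazePoseModel.full);
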
154 changes: 103 additions & 51 deletions Packages/BlazePoseBarracuda/ComputeShader/Process.compute
@@ -5,17 +5,19 @@

#define PI 3.14159265359

#include "UnityCG.cginc"
#include "PoseRegion.cginc"
#include "LowPassFilter.cginc"
#include "Misc.cginc"
#include "Packages/jp.ikep.mediapipe.posedetection/ComputeShader/Common.cginc"


// Kernel 0
uint _isLinerColorSpace;
uint _letterboxWidth;
float2 _spadScale;
sampler2D _letterboxInput;
RWTexture2D<float4> _letterboxTexture;
RWStructuredBuffer<float> _letterboxTextureBuffer;

// Generate letter-box image texture.
[numthreads(8, 8, 1)]
@@ -24,7 +26,7 @@ void LetterBoxImage(uint2 id : SV_DispatchThreadID)
if (any(id > _letterboxWidth)) return;

// UV coordinates
float2 uv = (id + 0.5) / _letterboxWidth;
float2 uv = float2(0.5 + id.x, _letterboxWidth - 0.5 - id.y) / _letterboxWidth;

// Scaling
uv = (uv - 0.5) * _spadScale + 0.5;
@@ -39,7 +41,13 @@ void LetterBoxImage(uint2 id : SV_DispatchThreadID)
// Bounding
rgb *= all(uv > 0) && all (uv < 1);

_letterboxTexture[id] = float4(rgb, 1);
// Convert to sRGB (gamma) color when in the Linear color space, because compute shader output is not color-converted automatically.
if(_isLinerColorSpace) rgb = LinearToGammaSpace(rgb);

uint offs = (id.y * _letterboxWidth + id.x) * 3;
_letterboxTextureBuffer[offs + 0] = rgb.r;
_letterboxTextureBuffer[offs + 1] = rgb.g;
_letterboxTextureBuffer[offs + 2] = rgb.b;
}
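
The letterbox kernel now writes a tightly packed RGB float buffer rather than an RWTexture2D. A hedged sketch of what the matching C# setup could look like (only the shader property and kernel names come from the code above; the 224-pixel width, the aspect-scale helper, and the per-frame dispatch are illustrative assumptions):

using UnityEngine;

public class LetterboxDispatchSketch : MonoBehaviour
{
    [SerializeField] ComputeShader compute;   // Process.compute
    [SerializeField] Texture sourceTexture;

    ComputeBuffer letterboxBuffer;
    const int LetterboxWidth = 224; // assumed network input resolution

    void Start(){
        // Packed RGB floats: width * width pixels, 3 floats per pixel.
        letterboxBuffer = new ComputeBuffer(LetterboxWidth * LetterboxWidth * 3, sizeof(float));
    }

    void Update(){
        int kernel = compute.FindKernel("LetterBoxImage");
        compute.SetInt("_letterboxWidth", LetterboxWidth);
        // Convert back to gamma only when the project runs in the Linear color space.
        compute.SetInt("_isLinerColorSpace",
            QualitySettings.activeColorSpace == ColorSpace.Linear ? 1 : 0);
        compute.SetVector("_spadScale", AspectScale());
        compute.SetTexture(kernel, "_letterboxInput", sourceTexture);
        compute.SetBuffer(kernel, "_letterboxTextureBuffer", letterboxBuffer);
        // 8x8 threads per group, as declared in [numthreads(8, 8, 1)].
        compute.Dispatch(kernel, LetterboxWidth / 8, LetterboxWidth / 8, 1);
    }

    Vector2 AspectScale(){
        // Scale that pads the source to a square while keeping its aspect ratio.
        float aspect = (float)sourceTexture.width / sourceTexture.height;
        return aspect > 1 ? new Vector2(1, aspect) : new Vector2(1 / aspect, 1);
    }

    void OnDestroy() => letterboxBuffer?.Dispose();
}
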


@@ -106,7 +114,7 @@ void PoseRegionUpdate(uint id : SV_DispatchThreadID)

sampler2D _inputTexture;
StructuredBuffer<PoseRegion> _cropRegion;
RWTexture2D<float4> _cropedTexture;
RWStructuredBuffer<float> _cropedTextureBuffer;

// Crop image from PoseRegion.
[numthreads(8, 8, 1)]
@@ -115,7 +123,7 @@ void CropImage(uint2 id : SV_DispatchThreadID)
float4x4 xform = _cropRegion[0].cropMatrix;

// UV coordinates
float2 uv = (id + 0.5) / CROP_IMAGE_SIZE;
float2 uv = float2(0.5 + id.x, CROP_IMAGE_SIZE - 0.5 - id.y) / CROP_IMAGE_SIZE;
uv = mul(xform, float4(uv, 0, 1)).xy;

// De-letterboxing
@@ -128,23 +136,32 @@ void CropImage(uint2 id : SV_DispatchThreadID)
// Texture sample
float3 rgb = tex2Dgrad(_inputTexture, uv, duv_dx, duv_dy).rgb;

// Set pixel value.
_cropedTexture[id] = float4(rgb, 1);
// Convert to sRGB (gamma) color when in the Linear color space, because compute shader output is not color-converted automatically.
if(_isLinerColorSpace) rgb = LinearToGammaSpace(rgb);

uint offs = (id.y * CROP_IMAGE_SIZE + id.x) * 3;
_cropedTextureBuffer[offs + 0] = rgb.r;
_cropedTextureBuffer[offs + 1] = rgb.g;
_cropedTextureBuffer[offs + 2] = rgb.b;
}


// Kernel 3

#define LPF_WINDOW_MAX_COUNT 5

uint _isWorldProcess;
uint _keypointCount;
float _postDeltatime;
float _velocity_scale;
uint _rvfWindowCount;

StructuredBuffer<float4> _postInput;
StructuredBuffer<float4> _postInputWorld;
StructuredBuffer<PoseRegion> _postRegion;

RWStructuredBuffer<float4> _postDeltaOutput;
RWStructuredBuffer<float> _postRvfWindowBuffer;
RWStructuredBuffer<float3> _postLastValueScale;
RWStructuredBuffer<float4> _postOutput;
RWStructuredBuffer<float4> _postDeltaOutputWorld;
RWStructuredBuffer<float4> _postOutputWorld;

// Map pose landmarks to coordinates on the original input image.
[numthreads(33 + 1, 1, 1)]
@@ -155,7 +172,6 @@ void PostProcess(uint id : SV_DispatchThreadID)
if(id == _keypointCount){
// Set the human-existence score at the last index.
_postOutput[id] = _postInput[_keypointCount];
_postOutputWorld[id] = _postInputWorld[_keypointCount];
}
else{
// Process for normalized landmark
@@ -166,45 +182,81 @@ void PostProcess(uint id : SV_DispatchThreadID)
float3 x = _postInput[id].xyz;
// Pose landmark on previous frame
float3 p_x = _postOutput[id].xyz;
// Pose landmark delta
float3 p_dx = _postDeltaOutput[id].xyz;

// Map from cropped-image coordinates to letter-box image coordinates.
x = mul(region.cropMatrix, float4(x, 1)).xyz;
// Map from letter-box image coordinates to original input image coordinates.
x.xy = (x.xy - 0.5) * _spadScale + 0.5;

// Apply a low-pass filter to decrease jitter.
float3 lpf_params = float3(30, 1.5, _postDeltatime);
float3 dx = lpf_Step_dx(x, p_x, p_dx, lpf_params);
x = lpf_Step_x(x, p_x, dx, lpf_params);

// Set pose landmark delta for the next frame processing.
_postDeltaOutput[id] = float4(dx, 1);
// Pose landmark final result
_postOutput[id] = float4(x, score);
// Previous value scale for the relative velocity filter.
float3 p_value_scale = _postLastValueScale[id];

if(_isWorldProcess){
x = mul(makeRotationMatrix(region.box.w), float4(x, 1)).xyz;
}
else{
// Map from cropped-image coordinates to letter-box image coordinates.
x = mul(region.cropMatrix, float4(x, 1)).xyz;
// Map from letter-box image coordinates to original input image coordinates.
x.xy = (x.xy - 0.5) * _spadScale + 0.5;
}

// Apply the relative velocity filter.
// Reference:
// https://github.com/asus4/tf-lite-unity-sample/blob/26e49bf4a45a550f84f12635a97102a3e207009e/Packages/com.github.asus4.mediapipe/Runtime/RelativeVelocityFilter.cs
float min_x = _postInput[0].x;
float max_x = _postInput[0].x;
float min_y = _postInput[0].y;
float max_y = _postInput[0].y;
for(uint k = 0; k < _keypointCount; k++){
if(min_x > _postInput[k].x) min_x = _postInput[k].x;
if(max_x < _postInput[k].x) max_x = _postInput[k].x;
if(min_y > _postInput[k].y) min_y = _postInput[k].y;
if(max_y < _postInput[k].y) max_y = _postInput[k].y;
}

float2 size = float2(max_x - min_x, max_y - min_y);
float3 value_scale = 1.0f / ((size.x + size.y) / 2);
int window_size = min(_rvfWindowCount, LPF_WINDOW_MAX_COUNT);
if(window_size > 0){
float3 distance = x * value_scale - p_x * p_value_scale;
float3 distance_sum = distance;
float duration_sum = _postDeltatime;
float duration_max = (1 + window_size) * _postDeltatime;

for(int i = 0; i < window_size; i++){
uint offs = (i * _keypointCount + id) * 4;
float3 p_distance = float3(_postRvfWindowBuffer[offs + 0], _postRvfWindowBuffer[offs + 1], _postRvfWindowBuffer[offs + 2]);
float p_duration = _postRvfWindowBuffer[offs + 3];
if(duration_sum + p_duration > duration_max) break;
distance_sum += p_distance;
duration_sum += p_duration;
}

float3 velocity = distance_sum / duration_sum;
float3 alpha = 1.0 - 1.0 / (1.0 + _velocity_scale * abs(velocity));
x = lerp(p_x, x, alpha);

// Window enqueue & dequeue for the next frame.
if(window_size == LPF_WINDOW_MAX_COUNT){
for(int j = 0; j < window_size - 1; j++){
uint offs1 = (j * _keypointCount + id) * 4;
uint offs2 = ((j + 1) * _keypointCount + id) * 4;
_postRvfWindowBuffer[offs1 + 0] = _postRvfWindowBuffer[offs2 + 0];
_postRvfWindowBuffer[offs1 + 1] = _postRvfWindowBuffer[offs2 + 1];
_postRvfWindowBuffer[offs1 + 2] = _postRvfWindowBuffer[offs2 + 2];
_postRvfWindowBuffer[offs1 + 3] = _postRvfWindowBuffer[offs2 + 3];
}
uint offs = ((window_size - 1) * _keypointCount + id) * 4;
_postRvfWindowBuffer[offs + 0] = distance.x;
_postRvfWindowBuffer[offs + 1] = distance.y;
_postRvfWindowBuffer[offs + 2] = distance.z;
_postRvfWindowBuffer[offs + 3] = _postDeltatime;
}
else{
uint offs = (window_size * _keypointCount + id) * 4;
_postRvfWindowBuffer[offs + 0] = distance.x;
_postRvfWindowBuffer[offs + 1] = distance.y;
_postRvfWindowBuffer[offs + 2] = distance.z;
_postRvfWindowBuffer[offs + 3] = _postDeltatime;
}
}

// ---------

// Process for world landmark
// Visibility of the pose world landmark
score = _postInputWorld[id].w;
// Pose world landmark
x = _postInputWorld[id].xyz;
// Pose world landmark on previous frame
p_x = _postOutputWorld[id].xyz;
// Pose world landmark delta
p_dx =_postDeltaOutputWorld[id].xyz;

x = mul(makeRotationMatrix(region.box.w), float4(x, 1)).xyz;

// Apply a low-pass filter to decrease jitter.
dx = lpf_Step_dx(x, p_x, p_dx, lpf_params);
x = lpf_Step_x(x, p_x, dx, lpf_params);

// Set pose world landmark delta for the next frame processing.
_postDeltaOutputWorld[id] = float4(dx, 1);
// Pose world landmark final result
_postOutputWorld[id] = float4(x, score);
_postOutput[id] = float4(x, score);
_postLastValueScale[id] = value_scale;
}
}
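
For reference, here is a hedged C# sketch of the relative-velocity-filter step the kernel above performs per landmark, modeled on the linked RelativeVelocityFilter.cs; the window bookkeeping is simplified and all names are illustrative:

using System.Collections.Generic;
using UnityEngine;

// Hedged sketch of the relative velocity filter; one instance filters one landmark.
class RelativeVelocityFilterSketch
{
    const int WindowMaxCount = 5; // mirrors LPF_WINDOW_MAX_COUNT
    readonly Queue<(Vector3 distance, float duration)> window = new Queue<(Vector3, float)>();
    Vector3 lastValue;
    float lastValueScale;
    bool hasLast;

    // value: raw landmark; valueScale: 1 / mean landmark bounding-box edge,
    // computed per frame from the landmark extents as in the shader; dt: frame delta time.
    public Vector3 Apply(Vector3 value, float valueScale, float dt, float velocityScale)
    {
        if(!hasLast){
            hasLast = true;
            lastValue = value;
            lastValueScale = valueScale;
            return value;
        }

        // Distance in size-normalized units, robust to the person's scale on screen.
        Vector3 distance = value * valueScale - lastValue * lastValueScale;

        // Estimate velocity from a short window of recent distances/durations.
        Vector3 distanceSum = distance;
        float durationSum = dt;
        float durationMax = (1 + window.Count) * dt;
        foreach(var (d, t) in window){
            if(durationSum + t > durationMax) break;
            distanceSum += d;
            durationSum += t;
        }
        Vector3 velocity = distanceSum / durationSum;

        // Fast motion -> alpha near 1 (follow the new value); slow motion -> smooth heavily.
        Vector3 filtered = new Vector3(
            Smooth(lastValue.x, value.x, velocity.x, velocityScale),
            Smooth(lastValue.y, value.y, velocity.y, velocityScale),
            Smooth(lastValue.z, value.z, velocity.z, velocityScale));

        // Enqueue the newest sample, dropping the oldest when the window is full.
        if(window.Count == WindowMaxCount) window.Dequeue();
        window.Enqueue((distance, dt));
        lastValue = filtered;
        lastValueScale = valueScale;
        return filtered;
    }

    static float Smooth(float prev, float next, float velocity, float velocityScale){
        float alpha = 1f - 1f / (1f + velocityScale * Mathf.Abs(velocity));
        return Mathf.Lerp(prev, next, alpha);
    }
}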