From 38b5da21ef2f3db79f08fb707d8d293759076b05 Mon Sep 17 00:00:00 2001 From: fo76utils <87907510+fo76utils@users.noreply.github.com> Date: Thu, 17 Oct 2024 23:52:10 +0200 Subject: [PATCH] Updated meshoptimizer --- lib/meshoptimizer/README.md | 43 +++++++++- lib/meshoptimizer/allocator.cpp | 2 +- lib/meshoptimizer/meshoptimizer.h | 14 ++-- lib/meshoptimizer/simplifier.cpp | 128 +++++++++++++++++------------- lib/meshoptimizer/vertexcodec.cpp | 79 ++++++++++++++++++ 5 files changed, 201 insertions(+), 65 deletions(-) diff --git a/lib/meshoptimizer/README.md b/lib/meshoptimizer/README.md index 56928130..fbba91fe 100644 --- a/lib/meshoptimizer/README.md +++ b/lib/meshoptimizer/README.md @@ -180,8 +180,6 @@ assert(resvb == 0 && resib == 0); Note that vertex encoding assumes that vertex buffer was optimized for vertex fetch, and that vertices are quantized; index encoding assumes that the vertex/index buffers were optimized for vertex cache and vertex fetch. Feeding unoptimized data into the encoders will produce poor compression ratios. Both codecs are lossless - the only lossy step is quantization that happens before encoding. -To reduce the data size further, it's recommended to use `meshopt_optimizeVertexCacheStrip` instead of `meshopt_optimizeVertexCache` when optimizing for vertex cache, and to use new index codec version (`meshopt_encodeIndexVersion(1)`). This trades off some efficiency in vertex transform for smaller vertex and index data. - Decoding functions are heavily optimized and can directly target write-combined memory; you can expect both decoders to run at 1-3 GB/s on modern desktop CPUs. Compression ratios depend on the data; vertex data compression ratio is typically around 2-4x (compared to already quantized data), index data compression ratio is around 5-6x (compared to raw 16-bit index data). General purpose lossless compressors can further improve on these results. Index buffer codec only supports triangle list topology; when encoding triangle strips or line lists, use `meshopt_encodeIndexSequence`/`meshopt_decodeIndexSequence` instead. This codec typically encodes indices into ~1 byte per index, but compressing the results further with a general purpose compressor can improve the results to 1-3 bits per index. @@ -220,6 +218,42 @@ meshopt_remapVertexBuffer(positions, positions, point_count, sizeof(vec3), &rema After this the resulting arrays should be quantized (e.g. using 16-bit fixed point numbers for positions and 8-bit color components), and the result can be compressed using `meshopt_encodeVertexBuffer` as described in the previous section. To decompress, `meshopt_decodeVertexBuffer` will recover the quantized data that can be used directly or converted back to original floating-point data. The compression ratio depends on the nature of source data, for colored points it's typical to get 35-40 bits per point as a result. +## Advanced compression + +Both vertex and index codecs are designed to be used in a three-stage pipeline: + +- Preparation (quantization, filtering, ordering) +- Encoding (`meshopt_encodeVertexBuffer`/`meshopt_encodeIndexBuffer`) +- Optional compression (LZ4/zlib/zstd/Oodle) + +The preparation stage is crucial for achieving good compression ratios; this section will cover some techniques that can be used to improve the results. + +The index codec targets 1 byte per triangle as a best case; on real-world data, it's typical to achieve 1-1.2 bytes per triangle. To reach this, the data needs to be optimized for vertex cache and vertex fetch. 
Optimizations that do not disrupt triangle locality (such as overdraw) are safe to use in between.
+To reduce the data size further, it's possible to use `meshopt_optimizeVertexCacheStrip` instead of `meshopt_optimizeVertexCache` when optimizing for vertex cache. This trades off some efficiency in vertex transform for smaller vertex and index data.
+
+When referenced vertex indices are not sequential, the index codec will use around 2 bytes per index. This can happen when the referenced vertices are a sparse subset of the vertex buffer, such as when encoding LODs. General-purpose compression can be especially helpful in this case.
+
+The vertex codec tries to take advantage of the inherent locality of sequential vertices and identify bit patterns that repeat in consecutive vertices. Typically, vertex cache + vertex fetch provides a reasonably local vertex traversal order; without an index buffer, it is recommended to sort vertices spatially to improve the compression ratio.
+It is crucial to correctly specify the stride when encoding vertex data; however, it does not matter whether the vertices are interleaved or deinterleaved, as the codecs perform full byte deinterleaving internally.
+
+For optimal compression results, the values must be quantized to small integers. It can be valuable to use bit counts that are not multiples of 8. For example, instead of using 16 bits to represent texture coordinates, use 12-bit integers and divide by 4095 in the shader. Alternatively, using half-precision floats can often achieve good results; a combined sketch follows the filter list below.
+For single-precision floating-point data, it's recommended to use `meshopt_quantizeFloat` to remove entropy from the lower bits of the mantissa. Due to current limitations of the codec, the bit count needs to be 15 (that is, the 23-bit mantissa minus 8 bits) for good results (7 can be used for more extreme compression).
+For normal or tangent vectors, using octahedral encoding is recommended over three components as it reduces redundancy. Similarly to other quantized values, consider using 10-12 bits per component instead of 16.
+
+> Note: vertex codec v0 is limited to taking advantage of redundancy in high bits of each byte. Because of this, packing multiple 10-bit values into 32 bits will reduce compression ratio, and when storing a 12-bit value in 16 bits, high bits should be zeroed out. This limitation may be lifted in future versions of the codec.
+
+To further leverage the inherent structure of some data, the preparation stage can use filters that encode and decode the data in a lossy manner. This is similar to quantization but can be used without having to change the shader code. After decoding, the filter transformation needs to be reversed. This library provides three filters:
+
+- Octahedral filter (`meshopt_encodeFilterOct`/`meshopt_decodeFilterOct`) encodes quantized (snorm) normal or tangent vectors using octahedral encoding. Any number of bits <= 16 can be used with 4 bytes or 8 bytes per vector.
+- Quaternion filter (`meshopt_encodeFilterQuat`/`meshopt_decodeFilterQuat`) encodes quantized (snorm) quaternion vectors; this can be used to encode rotations or tangent frames. Any number of bits between 4 and 16 can be used with 8 bytes per vector.
+- Exponential filter (`meshopt_encodeFilterExp`/`meshopt_decodeFilterExp`) encodes single-precision floating-point vectors; this can be used to encode arbitrary floating-point data more efficiently. In addition to an arbitrary bit count (<= 24), the filter takes a "mode" parameter that allows specifying how the exponent sharing is performed to trade off compression ratio and quality:
+
+ - `meshopt_EncodeExpSeparate` does not share exponents and results in the largest output
+ - `meshopt_EncodeExpSharedVector` shares exponents between different components of the same vector
+ - `meshopt_EncodeExpSharedComponent` shares exponents between the same component in different vectors
+
+Note that all filters are lossy and require the data to be deinterleaved with one attribute per stream; this facilitates efficient SIMD implementation of filter decoders, allowing the overall decompression speed to be close to that of the raw codec.
+
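A minimal sketch of the preparation stage described above, assuming a hypothetical position/normal/texcoord layout (`Vertex` and `PackedVertex` are illustrative, not library types; the `meshopt_` calls are the ones documented in this section):

```c++
#include <vector>
#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; };

// stream 0: quantized position + texcoord; 12 bytes (vertex size must be divisible by 4)
struct PackedVertex { unsigned short px, py, pz, pw, tu, tv; };

void compressVertexStreams(const std::vector<Vertex>& vertices)
{
	size_t count = vertices.size();
	std::vector<PackedVertex> packed(count);
	std::vector<float> normals(count * 4);

	for (size_t i = 0; i < count; ++i)
	{
		const Vertex& v = vertices[i];

		// assumes positions were pre-normalized into [0..1]; 12 bits keeps high bits zero
		packed[i].px = (unsigned short)meshopt_quantizeUnorm(v.px, 12);
		packed[i].py = (unsigned short)meshopt_quantizeUnorm(v.py, 12);
		packed[i].pz = (unsigned short)meshopt_quantizeUnorm(v.pz, 12);
		packed[i].pw = 0; // padding is encoded verbatim, so it must stay zeroed

		// 12-bit texture coordinates; the shader reconstructs by dividing by 4095
		packed[i].tu = (unsigned short)meshopt_quantizeUnorm(v.tu, 12);
		packed[i].tv = (unsigned short)meshopt_quantizeUnorm(v.tv, 12);

		// normals go into a separate stream, deinterleaved as the filters require
		normals[i * 4 + 0] = v.nx;
		normals[i * 4 + 1] = v.ny;
		normals[i * 4 + 2] = v.nz;
		normals[i * 4 + 3] = 0.f;
	}

	// stream 1: octahedral filter with 8-bit components, 4 bytes per normal
	std::vector<unsigned char> noct(count * 4);
	meshopt_encodeFilterOct(noct.data(), count, 4, 8, normals.data());

	// each stream is encoded separately; to decompress, call meshopt_decodeVertexBuffer
	// per stream and then reverse the filter with meshopt_decodeFilterOct
	std::vector<unsigned char> vbuf0(meshopt_encodeVertexBufferBound(count, sizeof(PackedVertex)));
	vbuf0.resize(meshopt_encodeVertexBuffer(vbuf0.data(), vbuf0.size(), packed.data(), count, sizeof(PackedVertex)));

	std::vector<unsigned char> vbuf1(meshopt_encodeVertexBufferBound(count, 4));
	vbuf1.resize(meshopt_encodeVertexBuffer(vbuf1.data(), vbuf1.size(), noct.data(), count, 4));

	// a real pipeline would store or transmit vbuf0/vbuf1, optionally after LZ4/zstd
}
```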
 ## Triangle strip conversion
 
 On most hardware, indexed triangle lists are the most efficient way to drive the GPU. However, in some cases triangle strips might prove beneficial:
 
@@ -306,9 +340,10 @@ This algorithm will not stop early due to topology restrictions but can still do
 
 Both algorithms can also return the resulting normalized deviation that can be used to choose the correct level of detail based on screen size or solid angle; the error can be converted to object space by multiplying by the scaling factor returned by `meshopt_simplifyScale`. For example, given a mesh with a precomputed LOD and a prescaled error, the screen-space normalized error can be computed and used for LOD selection:
 
 ```c++
+// lod_factor can be 1 or can be adjusted for more or less aggressive LOD selection
 float d = max(0, distance(camera_position, mesh_center) - mesh_radius);
-float e = d * (tan(camera_fovy / 2) * 2 / 1000); // assume ~1000 px vertical resolution
-bool lod_ok = e * lod_factor >= lod_error; // lod_factor can be 1 or can be adjusted for more or less aggressive LOD selection
+float e = d * (tan(camera_fovy / 2) * 2 / screen_height); // 1px in mesh space
+bool lod_ok = e * lod_factor >= lod_error;
 ```
 
 When a sequence of LOD meshes is generated that all use the original vertex buffer, care must be taken to order vertices optimally to not penalize mobile GPU architectures that are only capable of transforming a sequential vertex buffer range. It's recommended in this case to first optimize each LOD for vertex cache, then assemble all LODs in one large index buffer starting from the coarsest LOD (the one with fewest triangles), and call `meshopt_optimizeVertexFetch` on the final large index buffer. This will make sure that coarser LODs require a smaller vertex range and are efficient wrt vertex fetch and transform.
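The LOD concatenation described in the last paragraph might look like the following sketch (`Vertex` and the `lods` bookkeeping are assumed, not part of the library):

```c++
#include <vector>
#include "meshoptimizer.h"

struct Vertex { float px, py, pz, nx, ny, nz, tu, tv; }; // assumed vertex layout

// lods[0] is the finest (original) index buffer; higher indices are coarser LODs
std::vector<unsigned int> packLods(std::vector<std::vector<unsigned int> >& lods, std::vector<Vertex>& vertices)
{
	// optimize each LOD for vertex cache individually (in-place operation is supported)
	for (size_t i = 0; i < lods.size(); ++i)
		meshopt_optimizeVertexCache(lods[i].data(), lods[i].data(), lods[i].size(), vertices.size());

	// assemble one large index buffer, starting from the coarsest LOD so that coarse
	// LODs end up referencing the smallest sequential vertex range after reordering
	std::vector<unsigned int> combined;
	for (size_t i = lods.size(); i > 0; --i)
		combined.insert(combined.end(), lods[i - 1].begin(), lods[i - 1].end());

	// reorder vertices to match the combined buffer; indices are remapped in place, so
	// the caller only needs to keep each LOD's offset and count within the result
	size_t unique_vertices = meshopt_optimizeVertexFetch(vertices.data(), combined.data(), combined.size(), vertices.data(), vertices.size(), sizeof(Vertex));
	vertices.resize(unique_vertices);

	return combined;
}
```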
diff --git a/lib/meshoptimizer/allocator.cpp b/lib/meshoptimizer/allocator.cpp
index 12eda387..b8cb33c2 100644
--- a/lib/meshoptimizer/allocator.cpp
+++ b/lib/meshoptimizer/allocator.cpp
@@ -1,7 +1,7 @@
 // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
 #include "meshoptimizer.h"
 
-void meshopt_setAllocator(void*(MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void(MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
+void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*))
 {
 	meshopt_Allocator::Storage::allocate = allocate;
 	meshopt_Allocator::Storage::deallocate = deallocate;
diff --git a/lib/meshoptimizer/meshoptimizer.h b/lib/meshoptimizer/meshoptimizer.h
index feab7bb0..6c42f67f 100644
--- a/lib/meshoptimizer/meshoptimizer.h
+++ b/lib/meshoptimizer/meshoptimizer.h
@@ -35,7 +35,8 @@
 /* C interface */
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 /**
@@ -269,6 +270,7 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
 * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
 * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
 * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
 *
 * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
 */
@@ -600,7 +602,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* desti
 * Note that all algorithms only allocate memory for temporary use.
 * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
 */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*));
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
 
 #ifdef __cplusplus
 } /* extern "C" */
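A hypothetical caller-side sketch for this hook (the counting is illustrative; the only contract is the stack-like allocate/deallocate ordering noted above, and the default `MESHOPTIMIZER_ALLOC_CALLCONV` calling convention is assumed):

```c++
#include <cstdlib>
#include "meshoptimizer.h"

// because deallocation is stack-ordered, even a simple bump/arena allocator would do;
// this version just forwards to the C heap and tracks outstanding temporary blocks
static size_t live_blocks = 0;

static void* countingAllocate(size_t size)
{
	++live_blocks;
	return std::malloc(size);
}

static void countingDeallocate(void* ptr)
{
	--live_blocks;
	std::free(ptr);
}

void installMeshoptAllocator()
{
	meshopt_setAllocator(countingAllocate, countingDeallocate);
}
```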
@@ -748,8 +750,8 @@ class meshopt_Allocator
 	template <typename T>
 	struct StorageT
 	{
-		static void* (MESHOPTIMIZER_ALLOC_CALLCONV *allocate)(size_t);
-		static void (MESHOPTIMIZER_ALLOC_CALLCONV *deallocate)(void*);
+		static void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t);
+		static void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*);
 	};
 
 	typedef StorageT<void> Storage;
@@ -789,9 +791,9 @@ class meshopt_Allocator
 // This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
 template <typename T>
-void* (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+void* (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
 template <typename T>
-void (MESHOPTIMIZER_ALLOC_CALLCONV *meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+void (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
 #endif
 
 /* Inline implementation for C++ templated wrappers */
diff --git a/lib/meshoptimizer/simplifier.cpp b/lib/meshoptimizer/simplifier.cpp
index 927da0e1..af64cbda 100644
--- a/lib/meshoptimizer/simplifier.cpp
+++ b/lib/meshoptimizer/simplifier.cpp
@@ -369,12 +369,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
 	{
 		if (remap[i] == i)
 		{
-			if (vertex_lock && vertex_lock[sparse_remap ? sparse_remap[i] : i])
-			{
-				// vertex is explicitly locked
-				result[i] = Kind_Locked;
-			}
-			else if (wedge[i] == i)
+			if (wedge[i] == i)
 			{
 				// no attribute seam, need to check if it's manifold
 				unsigned int openi = openinc[i], openo = openout[i];
@@ -438,6 +433,18 @@
 		}
 	}
 
+	if (vertex_lock)
+	{
+		// vertex_lock may lock any wedge, not just the primary vertex, so we need to lock the primary vertex and relock any wedges
+		for (size_t i = 0; i < vertex_count; ++i)
+			if (vertex_lock[sparse_remap ? sparse_remap[i] : i])
+				result[remap[i]] = Kind_Locked;
+
+		for (size_t i = 0; i < vertex_count; ++i)
+			if (result[remap[i]] == Kind_Locked)
+				result[i] = Kind_Locked;
+	}
+
 	if (options & meshopt_SimplifyLockBorder)
 		for (size_t i = 0; i < vertex_count; ++i)
 			if (result[i] == Kind_Border)
@@ -1036,7 +1043,7 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
 		float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
 		float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
 
-#if TRACE >= 2
+#if TRACE >= 3
 		float di = ei, dj = ej;
 #endif
 
@@ -1052,7 +1059,7 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
 		c.v1 = ei <= ej ? i1 : j1;
 		c.error = ei <= ej ? ei : ej;
 
-#if TRACE >= 2
+#if TRACE >= 3
 		if (i0 == j0) // c.bidi has been overwritten
 			printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", c.v0, c.v1, sqrtf(c.error), sqrtf(ei <= ej ? di : dj), sqrtf(ei <= ej ? 
ei - di : ej - dj)); @@ -1110,7 +1117,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse } } -static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error) +static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error) { size_t edge_collapses = 0; size_t triangle_collapses = 0; @@ -1186,43 +1193,6 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* assert(collapse_remap[r0] == r0); assert(collapse_remap[r1] == r1); - unsigned int sx = i1; - - // for seam collapses we need to move the seam pair together; this is a bit tricky to compute since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges) - if (kind == Kind_Seam) - { - unsigned int s0 = wedge[i0]; - unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0]; - assert(s0 != i0 && wedge[s0] == i0); - assert(s1 != ~0u && remap[s1] == r1); - - // additional asserts to verify that the seam pair is consistent - assert(kind != vertex_kind[i1] || s1 == wedge[i1]); - assert(loop[i0] == i1 || loopback[i0] == i1); - assert(loop[s0] == s1 || loopback[s0] == s1); - - // note: this should never happen due to the assertion above, but when disabled if we ever hit this case we'll get a memory safety issue; for now play it safe - sx = (s1 != ~0u) ? 
s1 : wedge[i1]; - } - - quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); - - if (attribute_count) - { - quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]); - quadricAdd(&attribute_gradients[i1 * attribute_count], &attribute_gradients[i0 * attribute_count], attribute_count); - - // note: this is intentionally missing handling for Kind_Complex; we assume that complex vertices have similar attribute values so just using the primary vertex is fine - if (kind == Kind_Seam) - { - // seam collapses involve two edges so we need to update attribute quadrics for both target vertices; position quadrics are shared - unsigned int s0 = wedge[i0], s1 = sx; - - quadricAdd(attribute_quadrics[s1], attribute_quadrics[s0]); - quadricAdd(&attribute_gradients[s1 * attribute_count], &attribute_gradients[s0 * attribute_count], attribute_count); - } - } - if (kind == Kind_Complex) { // remap all vertices in the complex to the target vertex @@ -1236,10 +1206,19 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* } else if (kind == Kind_Seam) { - // remap v0 to v1 and seam pair of v0 to seam pair of v1 - unsigned int s0 = wedge[i0], s1 = sx; + // for seam collapses we need to move the seam pair together; this is a bit tricky to compute since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges) + unsigned int s0 = wedge[i0]; + unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0]; assert(s0 != i0 && wedge[s0] == i0); - assert(remap[s1] == r1); + assert(s1 != ~0u && remap[s1] == r1); + + // additional asserts to verify that the seam pair is consistent + assert(kind != vertex_kind[i1] || s1 == wedge[i1]); + assert(loop[i0] == i1 || loopback[i0] == i1); + assert(loop[s0] == s1 || loopback[s0] == s1); + + // note: this should never happen due to the assertion above, but when disabled if we ever hit this case we'll get a memory safety issue; for now play it safe + s1 = (s1 != ~0u) ? 
s1 : wedge[i1]; collapse_remap[i0] = i1; collapse_remap[s0] = s1; @@ -1251,6 +1230,8 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_remap[i0] = i1; } + // note: we technically don't need to lock r1 if it's a locked vertex, as it can't move and its quadric won't be used + // however, this results in slightly worse error on some meshes because the locked collapses get an unfair advantage wrt scheduling collapse_locked[r0] = 1; collapse_locked[r1] = 1; @@ -1274,6 +1255,38 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* return edge_collapses; } +static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error) +{ + for (size_t i = 0; i < vertex_count; ++i) + { + if (collapse_remap[i] == i) + continue; + + unsigned int i0 = unsigned(i); + unsigned int i1 = collapse_remap[i]; + + unsigned int r0 = remap[i0]; + unsigned int r1 = remap[i1]; + + // ensure we only update vertex_quadrics once: primary vertex must be moved if any wedge is moved + if (i0 == r0) + quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); + + if (attribute_count) + { + quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]); + quadricAdd(&attribute_gradients[i1 * attribute_count], &attribute_gradients[i0 * attribute_count], attribute_count); + + if (i0 == r0) + { + // when attributes are used, distance error needs to be recomputed as collapses don't track it; it is safe to do this after the quadric adjustment + float derr = quadricError(vertex_quadrics[r0], vertex_positions[r1]); + vertex_error = vertex_error < derr ? derr : vertex_error; + } + } + } +} + static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap) { size_t write = 0; @@ -1436,7 +1449,7 @@ static void measureComponents(float* component_errors, size_t component_count, c // we've used the output buffer as scratch space, so we need to move the results to proper indices for (size_t i = 0; i < component_count; ++i) { -#if TRACE > 1 +#if TRACE >= 2 printf("component %d: center %f %f %f, error %e\n", int(i), component_errors[i * 4 + 0], component_errors[i * 4 + 1], component_errors[i * 4 + 2], sqrtf(component_errors[i * 4 + 3])); #endif @@ -1929,6 +1942,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic size_t result_count = index_count; float result_error = 0; + float vertex_error = 0; // target_error input is linear; we need to adjust it to match quadricError units float error_scale = (options & meshopt_SimplifyErrorAbsolute) ? 
vertex_scale : 1.f; @@ -1961,12 +1975,17 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic memset(collapse_locked, 0, vertex_count); - size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, loop, loopback, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error); + size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, loop, loopback, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error); // no edges can be collapsed any more due to hitting the error limit or triangle collapse limit if (collapses == 0) break; + updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error); + + // updateQuadrics will update vertex error if we use attributes, but if we don't then result_error and vertex_error are equivalent + vertex_error = attribute_count == 0 ? result_error : vertex_error; + remapEdgeLoops(loop, vertex_count, collapse_remap); remapEdgeLoops(loopback, vertex_count, collapse_remap); @@ -1975,8 +1994,8 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic result_count = new_count; - if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= result_error) - result_count = pruneComponents(result, result_count, components, component_errors, component_count, result_error, component_nexterror); + if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= vertex_error) + result_count = pruneComponents(result, result_count, components, component_errors, component_count, vertex_error, component_nexterror); } // we're done with the regular simplification but we're still short of the target; try pruning more aggressively towards error_limit @@ -2000,6 +2019,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic result_count = new_count; result_error = result_error < component_maxerror ? component_maxerror : result_error; + vertex_error = vertex_error < component_maxerror ? component_maxerror : vertex_error; } #if TRACE @@ -2105,7 +2125,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind (triangles <= target_index_count / 3) ? 
"under" : "over"); #endif - float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles)); + float tip = interpolate(float(size_t(target_index_count / 3)), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles)); if (triangles <= target_index_count / 3) { diff --git a/lib/meshoptimizer/vertexcodec.cpp b/lib/meshoptimizer/vertexcodec.cpp index 17324362..1dbd2e35 100644 --- a/lib/meshoptimizer/vertexcodec.cpp +++ b/lib/meshoptimizer/vertexcodec.cpp @@ -90,6 +90,14 @@ #include #endif +#ifndef TRACE +#define TRACE 0 +#endif + +#if TRACE +#include +#endif + #ifdef SIMD_WASM #define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i) #define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) @@ -135,6 +143,19 @@ inline unsigned char unzigzag8(unsigned char v) return -(v & 1) ^ (v >> 1); } +#if TRACE +struct Stats +{ + size_t size; + size_t header; // bytes for header + size_t bitg[4]; // bytes for bit groups + size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group +}; + +static Stats* bytestats = NULL; +static Stats vertexstats[256]; +#endif + static bool encodeBytesGroupZero(const unsigned char* buffer) { for (size_t i = 0; i < kByteGroupSize; ++i) @@ -256,8 +277,16 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, assert(data + best_size == next); data = next; + +#if TRACE + bytestats->bitg[bitslog2] += best_size; +#endif } +#if TRACE + bytestats->header += header_size; +#endif + return data; } @@ -286,9 +315,31 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data vertex_offset += vertex_size; } +#if TRACE + const unsigned char* olddata = data; + bytestats = &vertexstats[k]; + + for (size_t ig = 0; ig < vertex_count; ig += kByteGroupSize) + { + unsigned char last = (ig == 0) ? last_vertex[k] : vertex_data[vertex_size * (ig - 1) + k]; + unsigned char delta = 0xff; + + for (size_t i = ig; i < ig + kByteGroupSize && i < vertex_count; ++i) + delta &= ~(vertex_data[vertex_size * i + k] ^ last); + + for (int j = 0; j < 8; ++j) + bytestats->bitc[j] += (vertex_count - ig < kByteGroupSize ? 
+	}
+#endif
+
 	data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
 	if (!data)
 		return NULL;
+
+#if TRACE
+	bytestats = NULL;
+	vertexstats[k].size += data - olddata;
+#endif
 }
 
 memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
@@ -1096,6 +1147,10 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
+#if TRACE
+	memset(vertexstats, 0, sizeof(vertexstats));
+#endif
+
 	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
 
 	unsigned char* data = buffer;
@@ -1148,6 +1203,30 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
 
+#if TRACE
+	size_t total_size = data - buffer;
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		const Stats& vsk = vertexstats[k];
+
+		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+		size_t total_k = vsk.header + vsk.bitg[0] + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[3];
+
+		printf(" |\thdr [%5.1f%%] bitg 1-3 [%4.1f%% %4.1f%% %4.1f%%]",
+		    double(vsk.header) / double(total_k) * 100, double(vsk.bitg[1]) / double(total_k) * 100,
+		    double(vsk.bitg[2]) / double(total_k) * 100, double(vsk.bitg[3]) / double(total_k) * 100);
+
+		printf(" |\tbitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
+		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
+		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
+		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
+		printf("\n");
+	}
+#endif
+
 	return data - buffer;
 }
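For reference, a hypothetical harness for the TRACE instrumentation added above; compiling vertexcodec.cpp with `-DTRACE=1` makes `meshopt_encodeVertexBuffer` print the per-byte-channel statistics (size share, bits per vertex, header/bit-group breakdown, bit consistency) as a side effect:

```c++
#include <vector>
#include "meshoptimizer.h"

// vertices must follow the usual encoder constraints (vertex_size is a multiple of 4
// and at most 256); the encoded output itself is unaffected by TRACE builds
void dumpVertexCodecStats(const void* vertices, size_t vertex_count, size_t vertex_size)
{
	std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));

	// with TRACE enabled, this prints one stats line per byte of vertex stride to stdout
	meshopt_encodeVertexBuffer(buffer.data(), buffer.size(), vertices, vertex_count, vertex_size);
}
```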