feat(math): add chi square calculation function.
Add a chi_square_probability function to the math_tools module. The
function returns the chi-square distribution probability (p-value) of a
byte sequence.

Chi-square tests are effective for distinguishing compressed from
encrypted data because they evaluate the uniformity of byte
distributions more rigorously than Shannon entropy.

In compressed files, bytes often cluster around certain values due to
patterns that still exist (albeit less detectable), resulting in a
non-uniform distribution. Encrypted data, by contrast, exhibits nearly
perfect uniformity, as each byte value from 0–255 is expected to appear
with almost equal frequency, making it harder to detect any discernible
patterns.
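
As a quick illustration of that difference (not part of this commit), the
new binding can be pointed at both kinds of data. The sketch below assumes
a build of unblob_native that includes this change; the exact probabilities
depend on the input, but compressed output typically scores close to 0
while random bytes land somewhere strictly between 0 and 1:

import os
import zlib

from unblob_native import math_tools

# Moderately compressible input: roughly 1 MiB of ASCII digits.
plain = "".join(str(i) for i in range(200_000)).encode()

compressed = zlib.compress(plain)          # stands in for a compressed chunk
random_like = os.urandom(len(compressed))  # stands in for an encrypted chunk

print("compressed:", math_tools.chi_square_probability(compressed))
print("random:", math_tools.chi_square_probability(random_like))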

The chi-square statistic is calculated for the stream of bytes in the
chunk and expressed both as an absolute number and as a percentage that
indicates how frequently a truly random sequence would exceed the
calculated value. Only the percentage is of interest from unblob's
perspective, which is why it is the only value we return.
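
For reference, the same computation can be reproduced outside the Rust
code. The sketch below is an independent cross-check (scipy is not a
project dependency) that mirrors the implementation in src/math_tools.rs:
a 256-bin byte histogram, the chi-square statistic against a uniform
expectation, and the upper-tail probability with 255 degrees of freedom:

from collections import Counter

from scipy.stats import chi2


def chi_square_probability(data: bytes) -> float:
    if not data:
        return 0.0
    # Uniform expectation: every byte value 0-255 is equally likely.
    expected = len(data) / 256
    counts = Counter(data)
    # Chi-square statistic: sum of (observed - expected)^2 / expected over 256 bins.
    statistic = sum(
        (counts.get(value, 0) - expected) ** 2 / expected for value in range(256)
    )
    # Upper-tail probability (p-value) with 256 - 1 = 255 degrees of freedom.
    return float(chi2.sf(statistic, df=255))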

According to the ent documentation [0]:

> We [can] interpret the percentage as the degree to which the
> sequence tested is suspected of being non-random. If the percentage is
> greater than 99% or less than 1%, the sequence is almost certainly not
> random. If the percentage is between 99% and 95% or between 1% and 5%,
> the sequence is suspect. Percentages between 90% and 95% and 5% and 10%
> indicate the sequence is “almost suspect”.

[0] - https://www.fourmilab.ch/random/
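
Applied to the value returned by chi_square_probability, which is a
probability between 0.0 and 1.0 rather than a percentage, those thresholds
translate into something like the hypothetical helper below (not part of
this commit, it only restates the ent guidance):

def classify_randomness(probability: float) -> str:
    """Map a chi-square probability (0.0-1.0) to the categories used by ent."""
    if probability > 0.99 or probability < 0.01:
        return "almost certainly not random"
    if probability > 0.95 or probability < 0.05:
        return "suspect"
    if probability > 0.90 or probability < 0.10:
        return "almost suspect"
    return "looks random"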
qkaiser committed Oct 30, 2024
1 parent 8c6d3fe commit af4bf51
Showing 4 changed files with 145 additions and 5 deletions.
21 changes: 20 additions & 1 deletion benches/benches_main.rs
@@ -26,6 +26,25 @@ fn shannon_entropy(c: &mut Criterion) {
group.finish();
}

criterion_group!(benches, shannon_entropy);
fn chi_square_probability(c: &mut Criterion) {
let mut sample = vec![0u8; 1 * MB];
StdRng::seed_from_u64(5).fill(&mut sample[..]);

let mut group = c.benchmark_group("Chi square probability");

for sample_size in [256, 1 * kB, 64 * kB, 256 * kB, 1 * MB] {
group.throughput(Throughput::Bytes(sample_size as u64));
group.bench_with_input(
BenchmarkId::from_parameter(sample_size),
&sample_size,
|b, &size| {
b.iter(|| unblob_native::math_tools::chi_square_probability(&sample[0..size]));
},
);
}
group.finish();
}

criterion_group!(benches, shannon_entropy, chi_square_probability);

criterion_main!(benches);
1 change: 1 addition & 0 deletions python/unblob_native/math_tools.pyi
@@ -1 +1,2 @@
def shannon_entropy(data: bytes) -> float: ...
def chi_square_probability(data: bytes) -> float: ...
106 changes: 102 additions & 4 deletions src/math_tools.rs
@@ -1,4 +1,5 @@
use pyo3::prelude::*;
use statrs::distribution::{ChiSquared, ContinuousCDF};

pub fn shannon_entropy(data: &[u8]) -> f64 {
let mut entropy = 0.0;
@@ -25,9 +26,47 @@ pub fn py_shannon_entropy(py: Python, data: &[u8]) -> PyResult<f64> {
py.allow_threads(|| Ok(shannon_entropy(data)))
}

pub fn chi_square_probability(data: &[u8]) -> f64 {
if data.is_empty() {
return 0.0;
}

// Total number of possible byte values (0–255)
let num_bins = 256;
let expected_count = data.len() as f64 / num_bins as f64;

// Frequency count for each byte value
let mut frequencies = [0u32; 256];
for &byte in data {
frequencies[byte as usize] += 1;
}

// Calculate chi-square statistic
let chi_square: f64 = frequencies
.iter()
.map(|&obs| {
let observed = obs as f64;
(observed - expected_count).powi(2) / expected_count
})
.sum();

// Degrees of freedom: 255 (256 bins - 1)
let degrees_of_freedom = (num_bins - 1) as f64;
let chi_squared = ChiSquared::new(degrees_of_freedom).unwrap();

// Compute p-value (chi-square probability)
1.0 - chi_squared.cdf(chi_square)
}

/// Calculates the chi-square probability (p-value) of data
#[pyfunction(name = "chi_square_probability")]
pub fn py_chi_square_probability(py: Python, data: &[u8]) -> PyResult<f64> {
py.allow_threads(|| Ok(chi_square_probability(data)))
}

pub fn init_module(root_module: &Bound<'_, PyModule>) -> PyResult<()> {
let module = PyModule::new_bound(root_module.py(), "math_tools")?;
module.add_function(wrap_pyfunction!(py_shannon_entropy, &module)?)?;
module.add_function(wrap_pyfunction!(py_chi_square_probability, &module)?)?;

root_module.add_submodule(&module)?;

@@ -46,10 +85,69 @@ mod tests {

use super::*;

#[test]
fn test_shannon_entropy() {
let input = b"000111"; // 50% entropy distribution ~ 1 bit information
mod shannon {
use super::*;

#[test]
fn test_shannon_entropy() {
let input = b"000111"; // 50% entropy distribution ~ 1 bit information

assert_relative_eq!(shannon_entropy(input), 1.0);
}
}

mod chi_square {
use super::*;
use rand_core::{OsRng, RngCore};

assert_relative_eq!(shannon_entropy(input), 1.0);
#[test]
fn test_non_uniform_distribution() {
let non_uniform_distribution = [0u8; 4096];
let chi_square_value = chi_square_probability(&non_uniform_distribution);

// Expect a probability of 0.0 for a maximally non-uniform distribution (all zeros)
assert_eq!(
chi_square_value, 0.0,
"Chi-square probability for non uniform distribution should be 0.0"
);
}

#[test]
fn test_uniform_distribution() {
let uniform_distribution: Vec<u8> = (0..=255).collect();
let chi_square_value = chi_square_probability(&uniform_distribution);

// Expect a probability of 1.0 for a perfectly uniform distribution
assert_eq!(
chi_square_value, 1.0,
"Chi-square probability for uniform distribution should be 1.0"
);
}

#[test]
fn test_random_distribution() {
let mut random_data = [0u8; 4096];
OsRng.fill_bytes(&mut random_data);
let chi_square_value = chi_square_probability(&random_data);

// Expect the probability to fall strictly between 0 and 1 for PRNG output
assert!(
chi_square_value > 0.0 && chi_square_value < 1.0,
"Chi-square probability for PRNG distribution should be within bounds"
);
}

#[test]
fn test_empty_data() {
// Edge case for empty data
let empty_data: Vec<u8> = Vec::new();
let chi_square_value = chi_square_probability(&empty_data);

// For empty data, chi_square_probability should handle it gracefully and return 0.0
assert_eq!(
chi_square_value, 0.0,
"Chi-square probability for empty data should be 0.0"
);
}
}
}
22 changes: 22 additions & 0 deletions tests/test_math.py
@@ -2,6 +2,9 @@

from unblob_native import math_tools

UNIFORM_DISTRIBUTION = bytes(x for x in range(256))
NON_UNIFORM_DISTRIBUTION = bytes([0] * 256)


@pytest.mark.parametrize(
"data,entropy",
@@ -15,3 +18,22 @@
)
def test_shannon_entropy(data: bytes, entropy: float):
assert math_tools.shannon_entropy(data) == pytest.approx(entropy)


@pytest.mark.parametrize(
"data,chi_square_value",
[
pytest.param(b"", 0, id="empty"),
pytest.param(UNIFORM_DISTRIBUTION, 1.0, id="uniform distribution"),
pytest.param(NON_UNIFORM_DISTRIBUTION, 0.0, id="non uniform distribution"),
pytest.param(
UNIFORM_DISTRIBUTION + NON_UNIFORM_DISTRIBUTION,
0.0,
id="partially uniform distribution",
),
],
)
def test_chi_square_probability(data: bytes, chi_square_value: float):
assert math_tools.chi_square_probability(data) == pytest.approx(
chi_square_value, abs=1e-4
)
