feat(math): add chi square calculation function.
Add a chi_square_probability function to the math_tools module. The
function returns the chi-square distribution probability (p-value) of a
byte sequence.

Chi-square tests are effective for distinguishing compressed from
encrypted data because they evaluate the uniformity of byte
distributions more rigorously than Shannon entropy.

In compressed files, bytes often cluster around certain values due to
patterns that still exist (albeit less detectable), resulting in a
non-uniform distribution. Encrypted data, by contrast, exhibits nearly
perfect uniformity, as each byte value from 0–255 is expected to appear
with almost equal frequency, making it harder to detect any discernible
patterns.
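
As a quick illustration of that difference (not part of this commit), the
new binding can be pointed at both kinds of data. The sketch below assumes
a build of unblob_native that includes this change; the exact probabilities
depend on the input, but compressed output typically scores close to 0
while random bytes land somewhere strictly between 0 and 1:

import os
import zlib

from unblob_native import math_tools

# Moderately compressible input: roughly 1 MiB of ASCII digits.
plain = "".join(str(i) for i in range(200_000)).encode()

compressed = zlib.compress(plain)          # stands in for a compressed chunk
random_like = os.urandom(len(compressed))  # stands in for an encrypted chunk

print("compressed:", math_tools.chi_square_probability(compressed))
print("random:", math_tools.chi_square_probability(random_like))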

The chi-square statistic is calculated for the stream of bytes in the
chunk and expressed both as an absolute number and as a percentage that
indicates how frequently a truly random sequence would exceed the
calculated value. Only the percentage is of interest from unblob's
perspective, which is why it is the only value we return.
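
For reference, the same computation can be reproduced outside the Rust
code. The sketch below is an independent cross-check (scipy is not a
project dependency) that mirrors the implementation in src/math_tools.rs:
a 256-bin byte histogram, the chi-square statistic against a uniform
expectation, and the upper-tail probability with 255 degrees of freedom:

from collections import Counter

from scipy.stats import chi2


def chi_square_probability(data: bytes) -> float:
    if not data:
        return 0.0
    # Uniform expectation: every byte value 0-255 is equally likely.
    expected = len(data) / 256
    counts = Counter(data)
    # Chi-square statistic: sum of (observed - expected)^2 / expected over 256 bins.
    statistic = sum(
        (counts.get(value, 0) - expected) ** 2 / expected for value in range(256)
    )
    # Upper-tail probability (p-value) with 256 - 1 = 255 degrees of freedom.
    return float(chi2.sf(statistic, df=255))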

According to the ent documentation [0]:

> We [can] interpret the percentage as the degree to which the
> sequence tested is suspected of being non-random. If the percentage is
> greater than 99% or less than 1%, the sequence is almost certainly not
> random. If the percentage is between 99% and 95% or between 1% and 5%,
> the sequence is suspect. Percentages between 90% and 95% and 5% and 10%
> indicate the sequence is “almost suspect”.

[0] - https://www.fourmilab.ch/random/
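
Applied to the value returned by chi_square_probability, which is a
probability between 0.0 and 1.0 rather than a percentage, those thresholds
translate into something like the hypothetical helper below (not part of
this commit, it only restates the ent guidance):

def classify_randomness(probability: float) -> str:
    """Map a chi-square probability (0.0-1.0) to the categories used by ent."""
    if probability > 0.99 or probability < 0.01:
        return "almost certainly not random"
    if probability > 0.95 or probability < 0.05:
        return "suspect"
    if probability > 0.90 or probability < 0.10:
        return "almost suspect"
    return "looks random"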
qkaiser committed Oct 30, 2024
1 parent 8c6d3fe commit af4bf51
Showing 4 changed files with 145 additions and 5 deletions.
21 changes: 20 additions & 1 deletion benches/benches_main.rs
@@ -26,6 +26,25 @@ fn shannon_entropy(c: &mut Criterion) {
group.finish();
}

criterion_group!(benches, shannon_entropy);
fn chi_square_probability(c: &mut Criterion) {
let mut sample = vec![0u8; 1 * MB];
StdRng::seed_from_u64(5).fill(&mut sample[..]);

let mut group = c.benchmark_group("Chi square probability");

for sample_size in [256, 1 * kB, 64 * kB, 256 * kB, 1 * MB] {
group.throughput(Throughput::Bytes(sample_size as u64));
group.bench_with_input(
BenchmarkId::from_parameter(sample_size),
&sample_size,
|b, &size| {
b.iter(|| unblob_native::math_tools::chi_square_probability(&sample[0..size]));
},
);
}
group.finish();
}

criterion_group!(benches, shannon_entropy, chi_square_probability);

criterion_main!(benches);
1 change: 1 addition & 0 deletions python/unblob_native/math_tools.pyi
@@ -1 +1,2 @@
def shannon_entropy(data: bytes) -> float: ...
def chi_square_probability(data: bytes) -> float: ...
106 changes: 102 additions & 4 deletions src/math_tools.rs
@@ -1,4 +1,5 @@
use pyo3::prelude::*;
use statrs::distribution::{ChiSquared, ContinuousCDF};

pub fn shannon_entropy(data: &[u8]) -> f64 {
let mut entropy = 0.0;
@@ -25,9 +26,47 @@ pub fn py_shannon_entropy(py: Python, data: &[u8]) -> PyResult<f64> {
py.allow_threads(|| Ok(shannon_entropy(data)))
}

pub fn chi_square_probability(data: &[u8]) -> f64 {
if data.is_empty() {
return 0.0;
}

// Total number of possible byte values (0–255)
let num_bins = 256;
let expected_count = data.len() as f64 / num_bins as f64;

// Frequency count for each byte value
let mut frequencies = [0u32; 256];
for &byte in data {
frequencies[byte as usize] += 1;
}

// Calculate chi-square statistic
let chi_square: f64 = frequencies
.iter()
.map(|&obs| {
let observed = obs as f64;
(observed - expected_count).powi(2) / expected_count
})
.sum();

// Degrees of freedom: 255 (256 bins - 1)
let degrees_of_freedom = (num_bins - 1) as f64;
let chi_squared = ChiSquared::new(degrees_of_freedom).unwrap();

// Compute p-value (chi-square probability)
1.0 - chi_squared.cdf(chi_square)
}

/// Calculates the chi-square probability (p-value) of data
#[pyfunction(name = "chi_square_probability")]
pub fn py_chi_square_probability(py: Python, data: &[u8]) -> PyResult<f64> {
py.allow_threads(|| Ok(chi_square_probability(data)))
}

pub fn init_module(root_module: &Bound<'_, PyModule>) -> PyResult<()> {
let module = PyModule::new_bound(root_module.py(), "math_tools")?;
module.add_function(wrap_pyfunction!(py_shannon_entropy, &module)?)?;
module.add_function(wrap_pyfunction!(py_chi_square_probability, &module)?)?;

root_module.add_submodule(&module)?;

@@ -46,10 +85,69 @@ mod tests {

use super::*;

#[test]
fn test_shannon_entropy() {
let input = b"000111"; // 50% entropy distribution ~ 1 bit information
mod shannon {
use super::*;

#[test]
fn test_shannon_entropy() {
let input = b"000111"; // 50% entropy distribution ~ 1 bit information

assert_relative_eq!(shannon_entropy(input), 1.0);
}
}

mod chi_square {
use super::*;
use rand_core::{OsRng, RngCore};

assert_relative_eq!(shannon_entropy(input), 1.0);
#[test]
fn test_non_uniform_distribution() {
let non_uniform_distribution = [0u8; 4096];
let chi_square_value = chi_square_probability(&non_uniform_distribution);

// Expect a probability of 0.0 for a maximally non-uniform distribution (all zeros)
assert_eq!(
chi_square_value, 0.0,
"Chi-square probability for non uniform distribution should be 0.0"
);
}

#[test]
fn test_uniform_distribution() {
let uniform_distribution: Vec<u8> = (0..=255).collect();
let chi_square_value = chi_square_probability(&uniform_distribution);

// Expect a probability of 1.0 for a perfectly uniform distribution
assert_eq!(
chi_square_value, 1.0,
"Chi-square probability for uniform distribution should be 1.0"
);
}

#[test]
fn test_random_distribution() {
let mut random_data = [0u8; 4096];
OsRng.fill_bytes(&mut random_data);
let chi_square_value = chi_square_probability(&random_data);

// Expect the probability to fall strictly between 0 and 1 for PRNG output
assert!(
chi_square_value > 0.0 && chi_square_value < 1.0,
"Chi-square probability for PRNG distribution should be within bounds"
);
}

#[test]
fn test_empty_data() {
// Edge case for empty data
let empty_data: Vec<u8> = Vec::new();
let chi_square_value = chi_square_probability(&empty_data);

// For empty data, chi_square_probability should handle it gracefully and return 0.0
assert_eq!(
chi_square_value, 0.0,
"Chi-square probability for empty data should be 0.0"
);
}
}
}
22 changes: 22 additions & 0 deletions tests/test_math.py
@@ -2,6 +2,9 @@

from unblob_native import math_tools

UNIFORM_DISTRIBUTION = bytes(x for x in range(256))
NON_UNIFORM_DISTRIBUTION = bytes([0] * 256)


@pytest.mark.parametrize(
"data,entropy",
@@ -15,3 +18,22 @@
)
def test_shannon_entropy(data: bytes, entropy: float):
assert math_tools.shannon_entropy(data) == pytest.approx(entropy)


@pytest.mark.parametrize(
"data,chi_square_value",
[
pytest.param(b"", 0, id="empty"),
pytest.param(UNIFORM_DISTRIBUTION, 1.0, id="uniform distribution"),
pytest.param(NON_UNIFORM_DISTRIBUTION, 0.0, id="non uniform distribution"),
pytest.param(
UNIFORM_DISTRIBUTION + NON_UNIFORM_DISTRIBUTION,
0.0,
id="partially uniform distribution",
),
],
)
def test_chi_square_probability(data: bytes, chi_square_value: float):
assert math_tools.chi_square_probability(data) == pytest.approx(
chi_square_value, abs=1e-4
)
