# analysis_of_categorical_data.py
#
# Chi-square goodness-of-fit test comparing observed M&M color counts with the
# 2008 expected color percentages, plus a bar chart of observed vs. expected
# counts.
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Step 1: Expected color distribution, in percent (2008 expected percentages).
expected_percentages = {
    'Blue': 24,
    'Orange': 20,
    'Green': 16,
    'Yellow': 14,
    'Red': 13,
    'Brown': 13,
}

# Step 2: Observed counts from the sample of M&Ms.
observed_counts = {
    'Blue': 105,
    'Orange': 91,
    'Green': 70,
    'Yellow': 50,
    'Red': 45,
    'Brown': 49,
}

# Sanity checks: the test is only meaningful when both tables describe the
# same categories and the expected percentages form a full distribution.
if set(expected_percentages) != set(observed_counts):
    raise ValueError("Expected and observed color categories do not match.")
if not np.isclose(sum(expected_percentages.values()), 100):
    raise ValueError("Expected percentages must sum to 100.")

# Total sample size, derived from the data so it cannot drift out of sync
# with the observed counts (410 for the sample above).
total_sample_size = sum(observed_counts.values())

# Step 3: Expected count per color = sample size * expected proportion.
expected_counts = {
    color: total_sample_size * (percentage / 100)
    for color, percentage in expected_percentages.items()
}

# Step 4: Chi-square goodness-of-fit statistic.
# Both arrays are built by looking colors up by key, so each observed count is
# paired with the expected count of the same color even if the two dicts list
# their colors in a different order.
observed_values = np.array([observed_counts[color] for color in expected_counts])
expected_values = np.array([expected_counts[color] for color in expected_counts])
chi_square_stat, p_value = stats.chisquare(observed_values, expected_values)

# Degrees of freedom: df = number of categories - 1
df = len(expected_percentages) - 1

# Step 5: Critical value of the chi-square distribution at significance alpha.
alpha = 0.05
critical_value = stats.chi2.ppf(1 - alpha, df)

# Step 6: Decision rule - reject H0 when the statistic exceeds the critical
# value (equivalent to p_value < alpha).
if chi_square_stat > critical_value:
    decision = "Reject the null hypothesis: The color distribution has changed since 2008."
else:
    decision = "Fail to reject the null hypothesis: No significant change in the color distribution."

# Step 7: Collect the results for reporting.
results = {
    'Chi-square statistic': chi_square_stat,
    'p-value': p_value,
    'Critical value': critical_value,
    'Decision': decision,
}
# Plot the observed vs expected counts for visual comparison.
# Bar heights are looked up by color key so each pair of bars always refers to
# the same color, regardless of the order in which the two dicts were written.
colors = list(expected_counts.keys())
observed_values = [observed_counts[color] for color in colors]
expected_values = [expected_counts[color] for color in colors]

fig, ax = plt.subplots(figsize=(10, 6))
bar_width = 0.35
index = np.arange(len(colors))

# Side-by-side bars: observed at the tick position, expected shifted right by
# one bar width.
bar1 = ax.bar(index, observed_values, bar_width, label='Observed', color='blue', alpha=0.7)
bar2 = ax.bar(index + bar_width, expected_values, bar_width, label='Expected', color='orange', alpha=0.7)

# Add labels and title; ticks are centered between each pair of bars.
ax.set_xlabel('M&M Colors', fontsize=12)
ax.set_ylabel('Counts', fontsize=12)
ax.set_title('Observed vs Expected M&M Color Distribution', fontsize=14)
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(colors)
ax.legend()
plt.tight_layout()

# Report the test results. A bare `results` expression only displays in an
# interactive session; printing makes the output visible when run as a script.
# Printed before plt.show() because show() may block until the window closes.
for label, value in results.items():
    print(f"{label}: {value}")

# Display the plot
plt.show()