forked from AIPI510/aipi510-fall24
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunpaired-t-test.py
179 lines (144 loc) · 8.33 KB
/
unpaired-t-test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import numpy as np
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
import argparse
import pytest
def get_cli_argument():
"""ArgumentParser to accept inputs from user via command line: sample size, mean & variance for the two randomly generated sample groups"""
parser = argparse.ArgumentParser(description="""Create two sample groups with random normal distribution based on specified mean, variance, and sample size,
then perform unpaired t-test to assess whether there is a significant difference in the mean of the two sample groups""")
parser.add_argument("--n1", type=int, dest="n1", default = "100", help="Sample Group 1: Sample Size")
parser.add_argument("--n2", type=int, dest="n2", default = "100", help="Sample Group 2: Sample Size")
parser.add_argument("--m1", type=float, dest="m1", default = "10", help="Sample Group 1: Mean")
parser.add_argument("--m2", type=float, dest="m2", default = "10", help="Sample Group 2: Mean")
parser.add_argument("--v1", type=float, dest="v1", default = "1", help="Sample Group 1: Variance")
parser.add_argument("--v2", type=float, dest="v2", default = "1", help="Sample Group 2: Variance")
args = parser.parse_args()
return args
def create_random_samples(n,m,v):
"""Generate a normally-distributed random sample group, with mean m, variance v, and sample size n"""
sample = np.random.normal(loc=m, scale=v, size=n)
# Print sample size, and resulting mean/variance
print("Sample Size = ", n)
print("Mean = ", "{:.4f}".format(np.mean(sample)))
print("Variance = ", "{:.4f}".format(np.var(sample)))
return sample
def normality_test(sample, shapiro_alpha):
"""Perform Shapiro-Wilk Test of Normality based on given sample group, and alpha"""
shapiro_test_stat, shapiro_p_value = scipy.stats.shapiro(sample)
# Interpretation
if shapiro_p_value > shapiro_alpha:
print("Fail to reject Null Hypothesis: Sample is from the normal distributions")
sample_is_normal = True
else:
print("Reject Null Hypothesis: Sample is NOT from the normal distributions")
sample_is_normal = False
return sample_is_normal
def levene_test(sample1, sample2, levene_alpha):
"""Perform Levene's test for homogeneity of variances based on two given sample groups, and alpha"""
levene_w_stats, levene_p_value = scipy.stats.levene(sample1, sample2, center='mean')
# Interpretation
if levene_p_value > levene_alpha:
print("Fail to reject Null Hypothesis: Homogeneity of Variances")
variance_is_similar = True
else:
print("Reject Null Hypothesis: No Homogeneity of Variances")
variance_is_similar = False
return variance_is_similar
def unpaired_t_test(sample1, sample2, df, p_value_critical):
"""Perform Unpaired t-test based on two given sample groups, degrees of freedom, and critical p-value"""
# Calculate t-statistic using scipy
t_statistic, p_value = scipy.stats.ttest_ind(a=sample1, b=sample2, equal_var=True)
print("\nCalculation of t-value:")
print("t-statistic = ", "{:.4f}".format(t_statistic))
# Calculate t-critial from given p-value and degrees of freedom
# Two-tailed test
t_critical = scipy.stats.t.ppf(q=1-p_value_critical/2,df=df)
print("\nCalculation of critical t-value:")
print("At p-value of ", "{:.2f}".format(p_value_critical))
print("t-critical = ", "{:.4f}".format(t_critical))
# Interpretation
print("\nInterpretation:")
if t_statistic > t_critical:
print("Reject H0: There is significant difference in the mean of the two sample groups")
reject_h0 = True
else:
print("Fail to reject H0: There is NO significant difference in the mean of the two sample groups")
reject_h0 = False
return reject_h0
# Test Functions
def test_create_random_samples():
"""Test creating a random normal sample group, with sample size of 100, mean of 10.0, and variance of 1.0, then check if resulting random sample group actually has 100 samples."""
assert len(create_random_samples(100,10.0,1.0)) == 100
def test_normality_test():
"""Test generating a random normal sample group, with sample size of 1000, mean of 10.0, and variance of 1.0, then perform Normality Test.
As the sample size is not small, we can expect that the resulting sample group should be approximately normally distributed"""
assert normality_test(create_random_samples(1000,10.0,1.0), 0.05) == True
def test_levene_test():
"""Test generating two random normal samples, both with sample size of 1000, mean of 10.0, and variance of 1.0, then perform Levene's Test.
As the sample size of the two sample groups are not small, we can expect that the variance of the two random sample groups should be similar"""
assert levene_test(create_random_samples(1000,10.0,1.0), create_random_samples(1000,10.0,1.0), 0.05) == True
def test_unpaired_t_test():
"""Test generating two random normal sample groups, both with sample size of 1000, mean of 10.0, and variance of 1.0, then perform Unpaired t-test.
As the sample size of the two sample groups are not small, we can expect that there is NO significant difference in the mean of the two sample groups (Fail to reject H0)"""
assert unpaired_t_test(create_random_samples(1000,10.0,1.0), create_random_samples(1000,10.0,1.0), 10000-2, 0.05) == False
def main():
"""Get sample size, mean & variance, create two random sample groups, then perform unpaired t-test"""
# Get all user's arguments
args = get_cli_argument()
n1 = args.n1
n2 = args.n2
m1 = args.m1
m2 = args.m2
v1 = args.v1
v2 = args.v2
# Create normally-distributed random sample1/sample2 groups based on given sample size, mean, and variance
print("\nSample Group 1")
sample1 = create_random_samples(n1,m1,v1)
print("\nSample Group 2")
sample2 = create_random_samples(n2,m2,v2)
# Calculate degrees of freedom
df = min(n1,n2) - 2
# Plot histograms of the two sample groups to see if they are Gaussian-like
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(data=sample1, bins = 20, ax=axes[0], color='blue')
axes[0].set_title('Sample Group 1 - Histogram')
sns.histplot(data=sample2, bins = 20, ax=axes[1], color='orange')
axes[1].set_title('Sample Group 2 - Histogram')
sns.histplot(data=sample1, bins = 20, ax=axes[2], color='blue')
sns.histplot(data=sample2, bins = 20, ax=axes[2], color='orange')
axes[2].set_title('Sample Group 1 (blue) and Sample Group 2 (orange)')
fig.suptitle('Histograms of the two Sample Groups')
plt.show()
# Normality Test: Check Assumption of Normality using Shapiro-Wilk Test of Normality
print("\nCheck Assumption of Normality:")
shapiro_alpha = 0.05
print("Sample Group 1")
sample_is_normal1 = normality_test(sample1, shapiro_alpha)
print("Sample Group 2")
sample_is_normal2 = normality_test(sample2, shapiro_alpha)
# Levene's Test: Check Assumption of Homogeneity of Variances
print("\nCheck Assumption of Homogeneity of Variances using Levene's Test:")
levene_alpha = 0.05
variance_is_similar = levene_test(sample1, sample2, levene_alpha)
# Perform unpaired t-test if both sample groups are approximately normally distributed, and variance are similar
print("\nUnpaired t-test")
print("H0 (Null Hypotheses): There is NO significant difference in the mean of the two sample groups")
print("H1 (Alternative Hypotheses): There is significant difference in the mean of the two sample groups")
if sample_is_normal1 and sample_is_normal2 and variance_is_similar:
print("\nBoth sample groups are approximately normally distributed, and variance are similar.")
print("Unpaired t-test will then be performed")
p_value_critical = 0.05
reject_h0 = unpaired_t_test(sample1, sample2, df, p_value_critical)
else:
print("\nAssumptions for the unpaired t-test are not all satisfied.")
print("Unpaired t-test will NOT be performed")
if not sample_is_normal1:
print("Sample 1 is not from the normal distributions")
if not sample_is_normal2:
print("Sample 2 is not from the normal distributions")
if not variance_is_similar:
print("No Homogeneity of Variances")
if __name__ == '__main__':
main()