forked from AIPI510/aipi510-fall24
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTA5-BadDataBusters-ZScoreApp.py
486 lines (372 loc) · 23.8 KB
/
TA5-BadDataBusters-ZScoreApp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import plotly.graph_objects as go
import plotly.express as px
# Custom CSS for better styling and visibility
st.markdown("""
<style>
.stApp {
background-color: #1e1e1e; /* Dark background for the main content area */
color: white; /* Set the main text color to white for contrast */
}
/* Sidebar styling */
.css-17eq0hr {
background-color: #252526; /* Dark background for the sidebar */
color: white; /* White text for better readability in sidebar */
}
/* General content text color */
.css-10trblm {
color: white !important; /* Force the content text to be white for consistency */
}
/* Input fields styling */
.css-1cpxqw2 {
background-color: #333333 !important; /* Dark background for input fields */
color: white !important; /* White text within input fields */
}
/* Button styling */
.st-button {
display: inline-block;
padding: 8px 16px;
font-size: 1.2em;
font-weight: bold;
color: #fff; /* White text for buttons */
background-color: #4e8cff; /* Blue background for buttons */
border: none; /* Remove button border */
border-radius: 5px; /* Rounded corners for buttons */
cursor: pointer; /* Show a pointer cursor when hovering over buttons */
margin: 10px; /* Add margin around buttons */
}
</style>
""", unsafe_allow_html=True) # Allow raw HTML/CSS to apply custom styling
# Sidebar for interactive controls and navigation
st.sidebar.title("🎛️ Navigation & Control Panel") # Title for the sidebar panel
# Initialize session state to track progress across modules
if 'progress' not in st.session_state:
st.session_state.progress = 0
# Radio buttons for navigating between different modules
module = st.sidebar.radio(
"Navigate through the modules", # Label for the radio button group
(
"📘 Module 1: Basics", # Basic module introduction
"📈 Module 2: Exploring Distributions", # Focus on data distributions
"🧮 Module 3: Z-Score Applications", # Module to explore Z-Scores
"📊 Module 4: Advanced Topics", # More advanced statistical concepts
"🎥 Module 5: Animated Normal Distributions", # Module with animated visualizations
"🌎 Module 6: Real-World Examples" # Practical, real-world applications of statistics
)
)
# Cacheable function to compute normal distribution
@st.cache_data
def compute_normal_distribution(mean, std_dev):
"""
Computes the x and y values for a normal distribution curve based on the given mean and standard deviation.
Args:
mean (float): The mean of the normal distribution.
std_dev (float): The standard deviation of the normal distribution.
Returns:
tuple: A tuple containing two numpy arrays, x (range of values) and y (probability density function values).
"""
x = np.linspace(mean - 4 * std_dev, mean + 4 * std_dev, 1000) # Generate x values within ±4 standard deviations of the mean
y = stats.norm.pdf(x, mean, std_dev) # Compute corresponding y values for normal distribution using PDF
return x, y
# Module 1: Basics of Z-Scores and Normal Distributions
# This module introduces the fundamental concepts of Z-scores and normal distributions.
# It covers the following key topics:
# 1. **Z-Scores**:
# - A Z-score measures how many standard deviations a given data point is from the mean.
# - The Z-score formula is presented, and its components (X, mean, and standard deviation) are explained.
# 2. **Normal Distribution**:
# - Explains the properties of a normal distribution, including symmetry, the alignment of mean, median, and mode,
# and how the standard deviation determines the spread of the data.
# - Visualizes the standard normal distribution with a bell curve, highlighting key points like the mean and ±1 standard deviation.
# 3. **Interactive Visualization**:
# - A static visualization is generated using Plotly to show the standard normal distribution curve.
# - Key statistical markers such as the mean and ±1 standard deviation are also indicated on the plot.
# This module is designed to provide users with an intuitive understanding of Z-scores and the normal distribution,
# preparing them for more advanced topics in subsequent modules.
# Progress is tracked using `st.session_state` to ensure users' progress through the modules is saved.
if module == "📘 Module 1: Basics":
st.header("📘 Module 1: Basics of Z-Scores and Normal Distributions") # Header for the first module
# Subsection explaining Z-scores
st.subheader("🔍 What is a Z-Score?")
st.write("""
A **Z-score** measures how many standard deviations a data point is from the mean. It's a way to standardize different datasets for easy comparison.
The formula for Z-score is:
""")
# Latex formula for Z-score
st.latex(r"Z = \frac{X - \mu}{\sigma}") # Displays the Z-score formula using LaTeX
# Explanation of the components in the Z-score formula
st.write("""
Where:
- $X$ is the value of interest.
- $\mu$ is the mean.
- $\sigma$ is the standard deviation.
""")
# Subsection explaining normal distribution
st.subheader("📊 What is a Normal Distribution?")
st.write("""
A **normal distribution** is a bell-shaped curve that shows how data spreads around the mean.
Some key characteristics of normal distributions include:
- **Symmetry**: The left and right sides are mirror images.
- **Mean, Median, and Mode** are the same.
- **Standard Deviation**: Determines the spread of the data around the mean.
""")
# Generate and display a static visualization of the standard normal distribution
x, y = compute_normal_distribution(0, 1) # Compute x and y values for a standard normal distribution (mean = 0, std dev = 1)
# Create a Plotly figure to visualize the standard normal distribution
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y, mode='lines', name='Standard Normal Distribution')) # Plot the main distribution curve
fig.add_trace(go.Scatter(x=[-1, -1], y=[0, stats.norm.pdf(-1, 0, 1)], mode='lines', name='-1σ')) # Add marker for -1 standard deviation
fig.add_trace(go.Scatter(x=[1, 1], y=[0, stats.norm.pdf(1, 0, 1)], mode='lines', name='+1σ')) # Add marker for +1 standard deviation
fig.add_trace(go.Scatter(x=[0, 0], y=[0, stats.norm.pdf(0, 0, 1)], mode='lines', name='Mean')) # Mark the mean at Z = 0
# Update layout with titles and axis labels
fig.update_layout(title='📈 Standard Normal Distribution',
xaxis_title='Z-Score', # Label for x-axis
yaxis_title='Probability Density') # Label for y-axis
st.plotly_chart(fig) # Display the Plotly chart in the Streamlit app
# Provide a summary of the content covered
st.write("You’ve now learned about the basics of Z-scores and the normal distribution. In the next section, we’ll dive deeper and explore how changing parameters affects the distribution.")
# Update progress tracking to indicate Module 1 has been completed (or revisited)
st.session_state.progress = max(st.session_state.progress, 1) # Ensure progress is recorded without decreasing it if revisited
# Module 2: Exploring Normal Distributions with Plotly
# This module allows users to interactively explore how changing the mean (μ) and standard deviation (σ) affects
# the shape and position of a normal distribution. The module features:
# 1. **Interactive Controls**:
# - Users can adjust the **mean** and **standard deviation** of the normal distribution using sliders in the sidebar.
# - The changes are reflected in real-time on an interactive Plotly graph, allowing users to see the immediate effect of these parameters.
# 2. **Normal Distribution Plot**:
# - A normal distribution is plotted based on the user-selected values of μ and σ.
# - The x-axis is fixed to range from -20 to 20, and the y-axis is fixed to a common probability density range (0 to 0.5) to keep the visual scale consistent.
# - The plot is updated dynamically as the user adjusts the mean and standard deviation.
# 3. **Detailed Explanation**:
# - Provides insights into how the **mean (μ)** shifts the center of the distribution along the x-axis while keeping the shape of the distribution unchanged.
# - Explains the role of **standard deviation (σ)** in controlling the spread of the distribution:
# - A smaller σ results in a narrow, tall peak (data clustered near the mean).
# - A larger σ results in a wider, flatter curve (data more spread out).
# - Discusses key statistical properties such as the **Empirical Rule** (68-95-99.7 rule) and how much data lies within 1, 2, or 3 standard deviations from the mean.
# 4. **Real-World Interpretation**:
# - Offers practical examples, showing how variations in σ indicate the degree of spread or variability in datasets like exam scores, stock prices, or patient outcomes.
# Progress is updated using `st.session_state`, advancing the progress tracker to mark the completion of Module 2.
if module == "📈 Module 2: Exploring Distributions":
st.header("📈 Module 2: Exploring Normal Distributions")
st.subheader("🔧 Adjusting Mean and Standard Deviation")
st.write("""
Now, let’s interactively explore the effect of changing the **mean** and **standard deviation** of a normal distribution.
Move the sliders below to adjust these parameters.
""")
# Sidebar inputs for interactive normal distribution
mean = st.sidebar.slider("Mean (μ)", min_value=-10.0, max_value=10.0, value=st.session_state.get("mean", 0.0), step=0.1)
std_dev = st.sidebar.slider("Standard Deviation (σ)", min_value=0.1, max_value=5.0, value=st.session_state.get("std_dev", 1.0), step=0.1)
# Fixed x-axis for the plot
x = np.linspace(-20, 20, 1000)
# Calculate the y-values (probability density function)
y = stats.norm.pdf(x, mean, std_dev)
# Create a Plotly figure
fig = go.Figure()
# Add trace for the normal distribution
fig.add_trace(go.Scatter(x=x, y=y, mode='lines', name=f'N(μ={mean:.2f}, σ={std_dev:.2f})'))
# Update layout for fixed axes
fig.update_layout(
title=f'Normal Distribution (μ={mean:.2f}, σ={std_dev:.2f})',
xaxis_title='X-axis',
yaxis_title='Probability Density',
xaxis=dict(range=[-20, 20]), # Fixed range for x-axis
yaxis=dict(range=[0, 0.5]), # Fixed range for y-axis based on common normal distribution ranges
showlegend=True
)
# Display the interactive plotly chart
st.plotly_chart(fig)
# Detailed insights on how mean and standard deviation affect the distribution
st.write(f"""
### Detailed Insights:
#### Mean (μ):
- The **mean (μ)** represents the center of the distribution.
- As the **mean** shifts, the peak of the distribution moves **left or right** along the x-axis, but the shape of the distribution remains the same.
- For example, with **μ = {mean:.2f}**, the distribution is centered at **{mean:.2f}**.
- The mean is also the **balance point** of the distribution, which means that half of the data lies to the left of this value, and half lies to the right.
#### Standard Deviation (σ):
- The **standard deviation (σ)** controls the **spread** or **width** of the distribution.
- A **smaller σ** results in a **narrow, tall** curve, meaning the data is more tightly clustered around the mean.
- A **larger σ** produces a **wider, flatter** curve, indicating that the data is more spread out from the mean.
- With **σ = {std_dev:.2f}**, the curve shows that approximately:
- **68%** of the data lies within **1σ** of the mean (from {mean - std_dev:.2f} to {mean + std_dev:.2f}).
- **95%** of the data lies within **2σ** of the mean (from {mean - 2 * std_dev:.2f} to {mean + 2 * std_dev:.2f}).
- **99.7%** of the data lies within **3σ** of the mean (from {mean - 3 * std_dev:.2f} to {mean + 3 * std_dev:.2f}).
- This is known as the **Empirical Rule**, and it highlights how data is distributed across the curve.
#### Shape of the Curve:
- **Symmetry**: The normal distribution is symmetric around the mean, meaning that the left and right sides of the curve are mirror images.
- The height of the peak represents the **probability density** at the mean.
- As you change **σ**, the height and width of the curve adjust. A smaller standard deviation results in a **taller** and **narrower** peak, indicating that most of the data is concentrated around the mean.
- As **σ** increases, the curve becomes **flatter**, indicating that data is more spread out, with higher variability.
#### Real-World Interpretations:
- If the standard deviation is small, it suggests that data points (e.g., exam scores, stock prices, or patient outcomes) are consistently close to the mean.
- If the standard deviation is large, it indicates greater variation or uncertainty in the data, meaning values are more spread out from the average.
""")
# Update progress
st.session_state.progress = max(st.session_state.progress, 2)
# Module 3: Z-Score Applications with Separate Sliders for Mean and Std Dev
# This module introduces practical applications of Z-scores, allowing users to calculate Z-scores based on custom inputs
# and explore their significance in terms of percentiles and outlier detection.
if module == "🧮 Module 3: Z-Score Applications":
st.header("🧮 Module 3: Z-Score Applications")
# Independent sliders for mean and std_dev in Module 3
st.subheader("Set Mean and Standard Deviation for Z-Score Calculation")
mean = st.slider("Mean (μ)", min_value=-10.0, max_value=10.0, value=0.0, step=0.1)
std_dev = st.slider("Standard Deviation (σ)", min_value=0.1, max_value=5.0, value=1.0, step=0.1)
st.subheader("🧮 Z-Score Calculator")
st.write("Now that you understand Z-scores, let’s calculate one based on your inputs.")
input_value = st.number_input("Enter a value to calculate its Z-score", value=0.0)
input_z_score = (input_value - mean) / std_dev
st.success(f"The Z-Score of {input_value} is: **{input_z_score:.2f}**")
st.subheader("Z-Scores and Percentiles")
st.write("""
Z-scores can be mapped to **percentiles**. For example:
- A Z-score of 0 is the 50th percentile (mean of the distribution).
- A Z-score of +1.96 is roughly the 97.5th percentile.
- A Z-score of -1.96 is roughly the 2.5th percentile.
""")
st.subheader("🕵️ Outlier Detection Using Z-Scores")
st.write("""
A Z-score helps detect **outliers**—data points far away from the mean. Common thresholds are:
- Z-scores > +2 or Z-scores < -2 may indicate **outliers**.
- Z-scores > +3 or Z-scores < -3 are **extreme outliers**.
""")
# Update progress
st.session_state.progress = max(st.session_state.progress, 3)
# Module 4: Advanced Topics (Central Limit Theorem, Applications) with Varying Sample Size
# This module demonstrates the Central Limit Theorem (CLT) in action by allowing users to explore how increasing the sample size
# affects the distribution of sample means.
# Key Features:
# 1. **Interactive Controls**:
# - Users can adjust the **mean** and **standard deviation** for generating a non-normal population (Exponential distribution).
# - The **sample size** can also be varied using a slider.
# 2. **Central Limit Theorem Visualization**:
# - A non-normal distribution is generated, and multiple samples are drawn to compute their means.
# - The resulting distribution of sample means is visualized using a histogram, demonstrating how the sample means approximate a normal distribution as sample size increases.
# 3. **Insights**:
# - Explanation of how the **sample size** affects the distribution of sample means.
# - Highlights the core principle of CLT: regardless of the population distribution, the distribution of sample means approaches normal as the sample size increases.
if module == "📊 Module 4: Advanced Topics":
st.header("📊 Module 4: Central Limit Theorem (CLT)")
st.subheader("Set Mean and Standard Deviation for CLT Analysis")
# Independent sliders for Module 4 (mean and standard deviation)
mean_4 = st.sidebar.slider("Mean (μ) for Module 4", min_value=-10.0, max_value=10.0, value=0.0, step=0.1)
std_dev_4 = st.sidebar.slider("Standard Deviation (σ) for Module 4", min_value=0.1, max_value=5.0, value=1.0, step=0.1)
# Slider for sample size
sample_size = st.sidebar.slider("Sample Size", min_value=5, max_value=500, value=30, step=5)
st.subheader("🔄 Central Limit Theorem (CLT) in Action")
st.write(f"""
The **Central Limit Theorem (CLT)** states that as the **sample size increases**, the distribution of the sample mean
approaches a **normal distribution**—even if the underlying population is not normal.
Let's generate a **non-normal distribution** and demonstrate how increasing the sample size affects the distribution of the sample means.
- Population Distribution: Exponential distribution with mean = {mean_4:.2f} and standard deviation = {std_dev_4:.2f}
""")
# Generate non-normal data (Exponential distribution) based on the mean and std_dev
population_data = np.random.exponential(scale=std_dev_4, size=1000) + mean_4 # Exponential shifted by the mean
# Calculate sample means with varying sample sizes
sample_means = [np.mean(np.random.choice(population_data, sample_size, replace=True)) for _ in range(500)]
# Plot the distribution of the sample means
fig_clt = px.histogram(sample_means, nbins=30, title=f"Distribution of Sample Means (Sample Size = {sample_size})")
fig_clt.update_layout(
xaxis_title='Sample Mean',
yaxis_title='Frequency',
showlegend=False,
xaxis=dict(range=[-5, 5]),
)
st.plotly_chart(fig_clt)
st.write(f"""
### Insights:
- With a **sample size of {sample_size}**, you can see how the sample means start to form a distribution that looks increasingly like a **normal distribution**.
- As you increase the sample size, the distribution of sample means becomes narrower, centered around the true mean of the population.
- This demonstrates the core idea of the **Central Limit Theorem**: regardless of the population distribution, the **distribution of sample means** approaches a normal distribution as the sample size increases.
""")
# Update progress
st.session_state.progress = max(st.session_state.progress, 4)
# Module 5: Plotly Animation of Varying Mean and Standard Deviation
# This module features an interactive animation that dynamically demonstrates how changing the **mean** (μ) and **standard deviation** (σ) affects the shape and position of a normal distribution.
# Key Features:
# 1. **Animation**:
# - The animation shows how the normal distribution curve shifts horizontally with changes in the mean and how the curve flattens or narrows as the standard deviation changes.
# 2. **Interactive Plotly Chart**:
# - The chart is animated with 50 frames to smoothly transition through varying mean and standard deviation values.
# - The layout includes a play/pause button, allowing users to control the animation.
# 3. **Visual Insights**:
# - Users can observe how increasing the mean shifts the distribution along the x-axis, while increasing the standard deviation makes the curve wider and flatter, indicating more variability.
if module == "🎥 Module 5: Animated Normal Distributions":
st.header("🎥 Module 5: Animated Normal Distributions")
st.subheader("See How Varying Mean and Standard Deviation Affects the Curve")
st.write("""
This animation demonstrates how varying the **mean (μ)** and **standard deviation (σ)** dynamically affects the shape of the normal distribution.
Watch how the curve changes as these two parameters are adjusted over time.
""")
# Parameters to vary
mean_values = np.linspace(-5, 5, 50) # 50 frames for mean variation
std_dev_values = np.linspace(0.5, 2.5, 50) # 50 frames for std dev variation
# Generate x-values for plotting
x = np.linspace(-20, 20, 1000)
# Create figure for animation
fig = go.Figure()
# Initial frame
initial_mean = mean_values[0]
initial_std_dev = std_dev_values[0]
y_initial = stats.norm.pdf(x, initial_mean, initial_std_dev)
fig.add_trace(go.Scatter(x=x, y=y_initial, mode='lines', name=f'μ={initial_mean}, σ={initial_std_dev}'))
# Create animation frames
frames = []
for i, (mean, std_dev) in enumerate(zip(mean_values, std_dev_values)):
y = stats.norm.pdf(x, mean, std_dev)
frames.append(go.Frame(data=[go.Scatter(x=x, y=y)], name=f'Frame {i}'))
# Update layout for animation
fig.update_layout(
title="Animation of Normal Distribution Varying Mean and Standard Deviation",
xaxis_title="X-axis",
yaxis_title="Probability Density",
xaxis=dict(range=[-20, 20]), # Fixed range for x-axis
yaxis=dict(range=[0, 0.5]), # Fixed range for y-axis based on common normal distribution ranges
updatemenus=[dict(type="buttons",
buttons=[dict(label="Play",
method="animate",
args=[None, {"frame": {"duration": 100, "redraw": True}, "fromcurrent": True}]),
dict(label="Pause",
method="animate",
args=[[None], {"frame": {"duration": 0, "redraw": False}, "mode": "immediate"}])])]
)
# Add frames to the figure
fig.frames = frames
# Display the animated plotly chart
st.plotly_chart(fig)
st.write("""
As you can see:
- **Increasing the mean (μ)** shifts the peak of the distribution to the right.
- **Increasing the standard deviation (σ)** flattens and widens the curve, indicating more variability.
""")
# Update progress
st.session_state.progress = max(st.session_state.progress, 5)
# Module 6: Real-World Examples
# This module introduces real-world examples of Z-scores in various fields, providing practical context to the theoretical concepts learned in previous modules.
if module == "🌎 Module 6: Real-World Examples":
st.header("🌎 Module 6: Real-World Applications of Z-Scores")
st.subheader("📈 Finance")
st.write("""
In finance, Z-scores are used to **standardize returns** and identify outliers or extreme values in stock price movements.
For example, a Z-score of +3 might indicate that a stock’s price is unusually high relative to its historical mean.
""")
st.subheader("⚕️ Healthcare")
st.write("""
Z-scores are often used in **healthcare** to assess how a patient’s test results compare to the population.
For example, a Z-score can help doctors identify if a patient’s cholesterol level is significantly higher or lower than average.
""")
st.subheader("📊 Research")
st.write("""
Z-scores are a key tool in **scientific research** for comparing results across different experiments or datasets, standardizing data from different populations, and detecting outliers.
""")
# Update progress
st.session_state.progress = max(st.session_state.progress, 6)
### Progress Tracker in Sidebar
st.sidebar.markdown("---")
st.sidebar.subheader("🚀 Your Learning Progress")
# Progress bar based on modules completed
progress = st.sidebar.progress(st.session_state.progress / 6)
st.sidebar.write(f"Modules Completed: {st.session_state.progress}/6")