You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
181 KiB
181 KiB
None
<html lang="en">
<head>
</head>
</html>
In [17]:
# Cell 1 - Imports and reproducibility
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
rng = np.random.default_rng(2026)
In [18]:
# Cell 2 - Define a discrete RV with PMF table
# X takes value values[i] with probability probs[i].
values = np.array([0, 1, 2]) # outcomes (support of X)
probs = np.array([0.2, 0.5, 0.3]) # p(x) for each outcome, same order as `values`
assert np.all(probs>=0), "Probabilities must be nonnegative"
assert np.isclose(probs.sum(), 1.0), "Probabilities must sum to 1"
$E[X]=\sum_x x\, p(x)$
$\text{Var}(X)=E[X^2]-(E[X])^2$
In [19]:
# Cell 3 - Analytical E[X], Var(X)
EX = np.sum(values*probs)  # E[X] = sum_x x p(x)
EX2 = np.sum((values**2)*probs)  # E[X^2] = sum_x x^2 p(x)
VarX = EX2-EX**2  # Var(X) = E[X^2] - (E[X])^2
SDX = np.sqrt(VarX)  # standard deviation
EX, VarX, SDX
Out[19]:
In [20]:
# Cell 4 - Monte Carlo sampling of X; empirical mean and variance
N = 50_000
X = rng.choice(values, size=N, p=probs)

# Quick visual sanity check of the first few draws.
# Fix: the figure previously had no labels/title, so it could not stand alone.
plt.figure()
plt.plot(X[0:50], '.k')
plt.xlabel("draw index i")
plt.ylabel("X_i")
plt.title("First 50 Monte Carlo draws of X")
plt.show()

eMean = X.mean()
eVar = X.var(ddof=0)  # population variance (divide by N), matches the PMF Var(X)
eSD = np.sqrt(eVar)
eMean, eVar, eSD
Out[20]:
In [21]:
X[0:9]
Out[21]:
In [22]:
# Cell 5 - Running mean (LLN intuition):
# \bar{X}_n vs n
rMean = np.cumsum(X) / np.arange(1,N+1)  # \bar{X}_n for n = 1..N
plt.plot(rMean, '.r', label="Running mean")
plt.grid()
plt.xlim([0,500])  # zoom in on the early, noisy part of the sequence
plt.axhline(EX, color="black", label="E[X]")
plt.legend()  # fix: labels were set on both artists but legend was never shown
plt.xlabel("n")
plt.ylabel("Running Mean")  # fix: typo "Runing"
plt.title("Running mean vs n")
plt.show()
In [23]:
# Cell 6 - How the estimator of E[X] changes with sample size
# Create many repetitions for different n and store the sample mean each time.
nList = [5, 10, 20, 50, 100, 200, 500, 1000]
R = 500  # repetitions per n
rows = []
for n in nList:
    # Draw an R x n matrix of samples; row r is repetition r at this n.
    samples = rng.choice(values, size=(R, n), p=probs)
    means = samples.mean(axis=1)
    # Idiom: extend with a generator instead of appending one dict at a time.
    rows.extend({"n": n, "mean_hat": m} for m in means)
df = pd.DataFrame(rows)
df.head()  # show a preview instead of dumping all R * len(nList) rows
Out[23]:
In [24]:
# Cell 7 - Boxplots of the sample mean across sample sizes
# Fix: the plot had no labels, title, or reference line, unlike the
# otherwise-identical boxplot cell later in the notebook.
groups = [df.loc[df["n"] == n, "mean_hat"].values for n in nList]
plt.boxplot(groups, labels=[str(n) for n in nList])
plt.axhline(EX, color="black", linewidth=1)  # theoretical mean for reference
plt.xlabel("Sample size n")
plt.ylabel("Sample mean (estimator of E[X])")
plt.title("Distribution of the sample mean vs sample size")
plt.show()
In [25]:
# Cell 8 — Verify linear transform rules: Y = a + bX
# E[a + bX] = a + b E[X]   and   Var(a + bX) = b^2 Var(X)
a, b = -5, 3
Y = a + b * X
# Analytical results
EY = a + b * EX
VarY = (b**2) * VarX
SDY = np.sqrt(VarY)
# Empirical results
emp_EY = Y.mean()
emp_VarY = Y.var(ddof=0)
emp_SDY = np.sqrt(emp_VarY)
# Inline sanity checks: with N = 50_000 the empirical moments should sit
# well within these tolerances (~5 standard errors) of the analytical ones.
assert np.isclose(emp_EY, EY, atol=0.05), "empirical mean far from E[Y]"
assert np.isclose(emp_VarY, VarY, rtol=0.05), "empirical variance far from Var(Y)"
(EY, VarY, SDY), (emp_EY, emp_VarY, emp_SDY)
Out[25]:
In [26]:
# Cell 9 — Visual check: histograms of X and Y (discrete RVs)
# Explicit Axes interface; same bins and density normalization as before.
fig, ax = plt.subplots()
ax.hist(X, bins=20, density=True)
ax.set_xlabel("X")
ax.set_ylabel("Relative frequency")
ax.set_title("Histogram of discrete X")
plt.show()

fig, ax = plt.subplots()
ax.hist(Y, bins=20, density=True)
ax.set_xlabel("Y = a + bX")
ax.set_ylabel("Relative frequency")
ax.set_title("Histogram of transformed variable Y")
plt.show()
In [31]:
# Cell 10 (optional) — Continuous RV example: Uniform(a,b)
# Compare the closed-form mean/variance with a large Monte Carlo sample;
# P(c < X < d) as an area is demonstrated in the next cell.
a_u, b_u = 2.0, 6.0
N_u = 200_000
Xu = rng.uniform(a_u, b_u, size=N_u)

# Closed form for Uniform(a,b): mean (a+b)/2, variance (b-a)^2/12
EXu = (a_u + b_u) / 2
VarXu = (b_u - a_u)**2 / 12

# (empirical mean, empirical variance) vs (analytical mean, analytical variance)
(Xu.mean(), Xu.var(ddof=0)), (EXu, VarXu)
Out[31]:
In [32]:
# Cell 11 (optional) — Probability query via simulation: P(c < X < d)
c, d = 3.0, 4.5
in_interval = (Xu > c) & (Xu < d)  # boolean mask: sample falls in (c, d)
prob_hat = in_interval.mean()      # fraction of hits = Monte Carlo estimate
# Analytical for uniform: length ratio (d-c)/(b-a), valid since [c,d] is inside [a,b]
prob_true = (d - c) / (b_u - a_u)
prob_hat, prob_true
Out[32]:
In [33]:
# Cell 12 (optional) — Empirical CDF plot (works for any sample)
# The empirical CDF jumps by 1/N at each sorted sample point, is monotone,
# and ends at 1.
xs = np.sort(Xu)
ecdf = np.arange(1, N_u + 1) / N_u
fig, ax = plt.subplots()
ax.plot(xs, ecdf, linewidth=1)
ax.set_xlabel("x")
ax.set_ylabel("F(x)")
ax.set_title("Empirical CDF from samples")
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [27]:
# Cell 4 — Monte Carlo sampling of X, empirical mean/variance
# NOTE(review): this duplicates the earlier Cell 4 and re-draws X, silently
# overwriting the earlier sample; execution counts are also out of order
# (In[27] after In[33]). Keep only one copy for a clean Restart-&-Run-All.
N = 50_000
X = rng.choice(values, size=N, p=probs)
emp_mean = X.mean()
emp_var = X.var(ddof=0) # population variance estimator
emp_sd = np.sqrt(emp_var)
emp_mean, emp_var, emp_sd
Out[27]:
In [28]:
# Cell 5 — Running mean (LLN intuition): \bar{X}_n vs n
# NOTE(review): duplicate of the earlier running-mean cell — keep one copy.
running_mean = np.cumsum(X) / np.arange(1, N + 1)  # \bar{X}_n for n = 1..N
plt.figure()
plt.plot(running_mean, linewidth=1)
plt.axhline(EX, linewidth=1) # theoretical mean
plt.xlabel("n")
plt.ylabel("Running mean")
plt.title("Running mean vs n (stabilizes near E[X])")
plt.show()
In [29]:
# Cell 6 — How the estimator of E[X] changes with sample size
# Create many repetitions for different n, store the sample mean each time.
# NOTE(review): duplicates the earlier sample-size cell — keep one copy.
n_list = [5, 10, 20, 50, 100, 200, 500, 1000]
R = 500  # repetitions per n (adjust for speed)
rows = []
for n in n_list:
    # Draw an R x n matrix of samples; row r is repetition r at this n.
    samples = rng.choice(values, size=(R, n), p=probs)
    means = samples.mean(axis=1)
    # Idiom: extend with a generator instead of appending one dict at a time.
    rows.extend({"n": n, "mean_hat": m} for m in means)
df_means = pd.DataFrame(rows)
df_means.head()
Out[29]:
In [30]:
# Cell 7 — Boxplots of \hat{E}[X] across sample sizes: the spread of the
# estimator shrinks as n grows (standard error ~ 1/sqrt(n)).
# NOTE(review): `labels=` is deprecated in matplotlib >= 3.9 in favor of
# `tick_labels=`; kept as-is for compatibility with older versions.
box_data = [df_means.loc[df_means["n"] == n, "mean_hat"].values for n in n_list]
plt.figure()
plt.boxplot(box_data, labels=[str(n) for n in n_list], showfliers=False)
plt.axhline(EX, linewidth=1)  # theoretical mean E[X] for reference
plt.xlabel("Sample size n")
plt.ylabel("Sample mean (estimator of E[X])")
plt.title("Distribution of the sample mean vs sample size")
plt.show()