You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
181 KiB
181 KiB
None
<html lang="en">
<head>
</head>
</html>
In [17]:
# Cell 1 - Imports and reproducibility
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
rng = np.random.default_rng(2026)
In [18]:
# Cell 2 - Define a discrete RV with PMF table
# X takes value values[i] with probability probs[i].
values = np.array([0, 1, 2]) # outcomes (support of X)
probs = np.array([0.2, 0.5, 0.3]) # p(x) for each outcome, same order as `values`
assert np.all(probs>=0), "Probabilities must be nonnegative"
assert np.isclose(probs.sum(), 1.0), "Probabilities must sum to 1"
$E[X]=\sum_x x\, p(x)$
$\text{Var}(X)=E[X^2]-(E[X])^2$
In [19]:
# Cell 3 - Analytical E[X], Var(X)
EX = np.sum(values*probs)  # E[X] = sum_x x p(x)
EX2 = np.sum((values**2)*probs)  # E[X^2] = sum_x x^2 p(x)
VarX = EX2-EX**2  # Var(X) = E[X^2] - (E[X])^2
SDX = np.sqrt(VarX)  # standard deviation
EX, VarX, SDX
Out[19]:
In [20]:
# Cell 4 - Monte Carlo sampling of X; empirical mean and variance
N = 50_000
X = rng.choice(values, size=N, p=probs)

# Quick visual sanity check of the first few draws.
# Fix: the figure previously had no labels/title, so it could not stand alone.
plt.figure()
plt.plot(X[0:50], '.k')
plt.xlabel("draw index i")
plt.ylabel("X_i")
plt.title("First 50 Monte Carlo draws of X")
plt.show()

eMean = X.mean()
eVar = X.var(ddof=0)  # population variance (divide by N), matches the PMF Var(X)
eSD = np.sqrt(eVar)
eMean, eVar, eSD
Out[20]:
In [21]:
X[0:9]
Out[21]:
In [22]:
# Cell 5 - Running mean (LLN intuition):
# \bar{X}_n vs n
rMean = np.cumsum(X) / np.arange(1,N+1)  # \bar{X}_n for n = 1..N
plt.plot(rMean, '.r', label="Running mean")
plt.grid()
plt.xlim([0,500])  # zoom in on the early, noisy part of the sequence
plt.axhline(EX, color="black", label="E[X]")
plt.legend()  # fix: labels were set on both artists but legend was never shown
plt.xlabel("n")
plt.ylabel("Running Mean")  # fix: typo "Runing"
plt.title("Running mean vs n")
plt.show()
In [23]:
# Cell 6 - How the estimator of E[X] changes with sample size
# Create many repetitions for different n and store the sample mean each time.
nList = [5, 10, 20, 50, 100, 200, 500, 1000]
R = 500  # repetitions per n
rows = []
for n in nList:
    # Draw an R x n matrix of samples; row r is repetition r at this n.
    samples = rng.choice(values, size=(R, n), p=probs)
    means = samples.mean(axis=1)
    # Idiom: extend with a generator instead of appending one dict at a time.
    rows.extend({"n": n, "mean_hat": m} for m in means)
df = pd.DataFrame(rows)
df.head()  # show a preview instead of dumping all R * len(nList) rows
Out[23]:
In [24]:
# Cell 7 - Boxplots of the sample mean across sample sizes
# Fix: the plot had no labels, title, or reference line, unlike the
# otherwise-identical boxplot cell later in the notebook.
groups = [df.loc[df["n"] == n, "mean_hat"].values for n in nList]
plt.boxplot(groups, labels=[str(n) for n in nList])
plt.axhline(EX, color="black", linewidth=1)  # theoretical mean for reference
plt.xlabel("Sample size n")
plt.ylabel("Sample mean (estimator of E[X])")
plt.title("Distribution of the sample mean vs sample size")
plt.show()
In [25]:
# Cell 8 — Verify linear transform rules: Y = a + bX
# E[a + bX] = a + b E[X]   and   Var(a + bX) = b^2 Var(X)
a, b = -5, 3
Y = a + b * X
# Analytical results
EY = a + b * EX
VarY = (b**2) * VarX
SDY = np.sqrt(VarY)
# Empirical results
emp_EY = Y.mean()
emp_VarY = Y.var(ddof=0)
emp_SDY = np.sqrt(emp_VarY)
# Inline sanity checks: with N = 50_000 the empirical moments should sit
# well within these tolerances (~5 standard errors) of the analytical ones.
assert np.isclose(emp_EY, EY, atol=0.05), "empirical mean far from E[Y]"
assert np.isclose(emp_VarY, VarY, rtol=0.05), "empirical variance far from Var(Y)"
(EY, VarY, SDY), (emp_EY, emp_VarY, emp_SDY)
Out[25]:
In [26]:
# Cell 9 — Visual check: histograms of X and Y (discrete RVs)
# Explicit Axes interface; same bins and density normalization as before.
fig, ax = plt.subplots()
ax.hist(X, bins=20, density=True)
ax.set_xlabel("X")
ax.set_ylabel("Relative frequency")
ax.set_title("Histogram of discrete X")
plt.show()

fig, ax = plt.subplots()
ax.hist(Y, bins=20, density=True)
ax.set_xlabel("Y = a + bX")
ax.set_ylabel("Relative frequency")
ax.set_title("Histogram of transformed variable Y")
plt.show()
In [31]:
# Cell 10 (optional) — Continuous RV example: Uniform(a,b)
# Compare the closed-form mean/variance with a large Monte Carlo sample;
# P(c < X < d) as an area is demonstrated in the next cell.
a_u, b_u = 2.0, 6.0
N_u = 200_000
Xu = rng.uniform(a_u, b_u, size=N_u)

# Closed form for Uniform(a,b): mean (a+b)/2, variance (b-a)^2/12
EXu = (a_u + b_u) / 2
VarXu = (b_u - a_u)**2 / 12

# (empirical mean, empirical variance) vs (analytical mean, analytical variance)
(Xu.mean(), Xu.var(ddof=0)), (EXu, VarXu)
Out[31]:
In [32]:
# Cell 11 (optional) — Probability query via simulation: P(c < X < d)
c, d = 3.0, 4.5
in_interval = (Xu > c) & (Xu < d)  # boolean mask: sample falls in (c, d)
prob_hat = in_interval.mean()      # fraction of hits = Monte Carlo estimate
# Analytical for uniform: length ratio (d-c)/(b-a), valid since [c,d] is inside [a,b]
prob_true = (d - c) / (b_u - a_u)
prob_hat, prob_true
Out[32]:
In [33]:
# Cell 12 (optional) — Empirical CDF plot (works for any sample)
# The empirical CDF jumps by 1/N at each sorted sample point, is monotone,
# and ends at 1.
xs = np.sort(Xu)
ecdf = np.arange(1, N_u + 1) / N_u
fig, ax = plt.subplots()
ax.plot(xs, ecdf, linewidth=1)
ax.set_xlabel("x")
ax.set_ylabel("F(x)")
ax.set_title("Empirical CDF from samples")
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [27]:
# Cell 4 — Monte Carlo sampling of X, empirical mean/variance
# NOTE(review): this duplicates the earlier Cell 4 and re-draws X, silently
# overwriting the earlier sample; execution counts are also out of order
# (In[27] after In[33]). Keep only one copy for a clean Restart-&-Run-All.
N = 50_000
X = rng.choice(values, size=N, p=probs)
emp_mean = X.mean()
emp_var = X.var(ddof=0) # population variance estimator
emp_sd = np.sqrt(emp_var)
emp_mean, emp_var, emp_sd
Out[27]:
In [28]:
# Cell 5 — Running mean (LLN intuition): \bar{X}_n vs n
# NOTE(review): duplicate of the earlier running-mean cell — keep one copy.
running_mean = np.cumsum(X) / np.arange(1, N + 1)  # \bar{X}_n for n = 1..N
plt.figure()
plt.plot(running_mean, linewidth=1)
plt.axhline(EX, linewidth=1) # theoretical mean
plt.xlabel("n")
plt.ylabel("Running mean")
plt.title("Running mean vs n (stabilizes near E[X])")
plt.show()
In [29]:
# Cell 6 — How the estimator of E[X] changes with sample size
# Create many repetitions for different n, store the sample mean each time.
# NOTE(review): duplicates the earlier sample-size cell — keep one copy.
n_list = [5, 10, 20, 50, 100, 200, 500, 1000]
R = 500  # repetitions per n (adjust for speed)
rows = []
for n in n_list:
    # Draw an R x n matrix of samples; row r is repetition r at this n.
    samples = rng.choice(values, size=(R, n), p=probs)
    means = samples.mean(axis=1)
    # Idiom: extend with a generator instead of appending one dict at a time.
    rows.extend({"n": n, "mean_hat": m} for m in means)
df_means = pd.DataFrame(rows)
df_means.head()
Out[29]:
In [30]:
# Cell 7 — Boxplots of \hat{E}[X] across sample sizes: the spread of the
# estimator shrinks as n grows (standard error ~ 1/sqrt(n)).
# NOTE(review): `labels=` is deprecated in matplotlib >= 3.9 in favor of
# `tick_labels=`; kept as-is for compatibility with older versions.
box_data = [df_means.loc[df_means["n"] == n, "mean_hat"].values for n in n_list]
plt.figure()
plt.boxplot(box_data, labels=[str(n) for n in n_list], showfliers=False)
plt.axhline(EX, linewidth=1)  # theoretical mean E[X] for reference
plt.xlabel("Sample size n")
plt.ylabel("Sample mean (estimator of E[X])")
plt.title("Distribution of the sample mean vs sample size")
plt.show()