# %% [markdown]
# # Week 2 Lab (Jupyter) — Descriptive Stats + Sampling Variability + Bootstrap
# **Course focus:** Descriptive statistics (center/spread/shape) + population vs sample + sampling variability 
# **Lab focus:** Bootstrap intuition (mean vs median) + why estimates “move” + effect of sample size $n$

# %%
# Cell 1 — Imports + settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# One named seed for the whole lab: every stochastic cell below draws from
# `rng`, so a fresh Restart Kernel -> Run All reproduces the same numbers.
SEED = 7
rng = np.random.default_rng(SEED)

# Quick draw to confirm the generator works. It advances `rng`, so the values
# produced by later cells depend on this call being executed exactly once.
rng.normal(size=3)

# %% [markdown]
# ## Example 1 — Descriptive statistics (center, spread, shape)
# We’ll compute: mean/median, SD/variance, IQR, five-number summary, and flag outliers using the 1.5×IQR rule.
# %%
# Cell 2 — Build a dataset with skew + an outlier
n = 60
raw_values = rng.lognormal(mean=1.0, sigma=0.6, size=n)  # right-skewed draws
raw_values[-1] = raw_values[-1] * 10  # the last observation becomes the injected outlier
x = pd.Series(raw_values, name="x")

(x.head(), x.describe())

# %%
# Cell 3 — Five-number summary + IQR + outlier fences
q1, q3 = (x.quantile(prob) for prob in (0.25, 0.75))
iqr = q3 - q1

# 1.5×IQR rule: anything beyond these fences is flagged as an outlier.
fence_width = 1.5 * iqr
lower_fence = q1 - fence_width
upper_fence = q3 + fence_width

five_num = dict(
    zip(
        ("min", "Q1", "median", "Q3", "max"),
        (x.min(), q1, x.median(), q3, x.max()),
    )
)
(five_num, iqr, (lower_fence, upper_fence))

# %%
# Cell 4 — Identify outliers + summary of center/spread
below_fence = x < lower_fence
above_fence = x > upper_fence
outliers = x[below_fence | above_fence]

summary = {}
summary["mean"] = x.mean()
summary["median"] = x.median()
summary["std"] = x.std(ddof=1)  # sample SD (n - 1 denominator)
summary["var"] = x.var(ddof=1)  # sample variance (n - 1 denominator)
summary["IQR"] = iqr
summary["n_outliers"] = len(outliers)
(summary, outliers)
# %%
# Cell 5 — Visual diagnostics: histogram + boxplot
plt.figure()
plt.hist(x, bins=25)
plt.title("Histogram (skew + outlier)")
plt.xlabel("x"); plt.ylabel("count")
plt.show()

plt.figure()
plt.boxplot(x, vert=False, showmeans=True)
plt.title("Boxplot (shows outliers)")
# The box is drawn horizontally (vert=False), so the data values run along
# the x-axis; that is the axis that carries the variable's label.
plt.xlabel("x")
plt.show()

# %% [markdown]
# ## Example 2 — Sampling variability: why estimates “move”
# We create a *population* (known distribution), then repeatedly take random samples and compute the sample mean.
# We compare how variability changes for different sample sizes $n$.

# %%
# Cell 6 — Define a population (large synthetic population)
# (Treat this as the "true" population we are sampling from.)
population = rng.lognormal(mean=1.0, sigma=0.6, size=200_000)
pop_mu = population.mean()
pop_sd = population.std(ddof=0)  # population SD: divide by N, not N - 1

pop_mu, pop_sd

# %%
# Cell 7 — Repeated sampling experiment
def repeated_sampling_means(pop, n, R=3000, rng=7):
    """Return the means of R random samples of size n drawn from pop.

    Each sample is drawn without replacement via ``rng.choice``.

    Parameters
    ----------
    pop : array-like
        Values to sample from.
    n : int
        Size of each sample.
    R : int, default 3000
        Number of repeated samples.
    rng : object with a ``choice`` method, or a seed, default 7
        A generator that already offers ``.choice`` is used as-is (so its
        state advances for the caller). Anything else (an int seed such as
        the default, or None) is passed to ``np.random.default_rng`` to
        build a generator.

    Returns
    -------
    numpy.ndarray
        Array of length R holding one sample mean per repetition.
    """
    # The default is an int seed, not a generator; calling `.choice` on it
    # directly would raise AttributeError, so coerce seed-like values first.
    if not hasattr(rng, "choice"):
        rng = np.random.default_rng(rng)

    means = np.empty(R)
    for r in range(R):
        sample = rng.choice(pop, size=n, replace=False)
        means[r] = sample.mean()
    return means

# %%
R = 300
means_n10 = repeated_sampling_means(population, n=10, R=R, rng=rng)
means_n50 = repeated_sampling_means(population, n=50, R=R, rng=rng)
means_n200 = repeated_sampling_means(population, n=200, R=R, rng=rng)

# Empirical standard errors: the spread of the sample means for each n.
np.std(means_n10, ddof=1), np.std(means_n50, ddof=1), np.std(means_n200, ddof=1)
# %%
# Cell 8 — Plot sampling distributions of the mean for different n
bins = 10

# One figure per sample size; the loop keeps the three plots identical in
# everything except the data and the n shown in the title.
for sample_size, sample_means in ((10, means_n10), (50, means_n50), (200, means_n200)):
    plt.figure()
    plt.hist(sample_means, bins=bins)
    # Dashed black reference line: with default settings the line would share
    # the bars' colour and be hard to see, and without a label the reader
    # cannot tell what it marks.
    plt.axvline(pop_mu, color="black", linestyle="--", label="population mean")
    plt.title(f"Sampling distribution of mean (n={sample_size})")
    plt.xlabel("sample mean"); plt.ylabel("count")
    plt.legend()
    plt.show()
# %%
# Cell 9 — Quick table: variability vs n (empirical standard error)
se_table = pd.DataFrame({
    "n": [10, 50, 200],
    "SD of sample means (empirical SE)": [
        np.std(means_n10, ddof=1),
        np.std(means_n50, ddof=1),
        np.std(means_n200, ddof=1),
    ]
})
se_table

# %% [markdown]
# ## Example 3 — Bootstrap intuition: mean vs median (with an outlier)
# Bootstrap = resample *with replacement* from the observed sample to approximate sampling variability.

# %%
# Cell 10 — Bootstrap function
def bootstrap_statistic(x, stat_fn, B=5000, rng=None):
    """Return B bootstrap replicates of ``stat_fn`` computed on ``x``.

    Each replicate resamples ``len(x)`` observations from ``x`` with
    replacement and applies ``stat_fn`` to the resample.

    Parameters
    ----------
    x : array-like
        Observed sample; converted with ``np.asarray``.
    stat_fn : callable
        Called on each resampled array; its result is stored as a float.
    B : int, default 5000
        Number of bootstrap replicates.
    rng : object with a ``choice`` method, seed, or None
        A generator that already offers ``.choice`` is used as-is. Anything
        else (None or an int seed) is passed to ``np.random.default_rng``.

    Returns
    -------
    numpy.ndarray
        Float array of length B, one statistic per resample.

    Raises
    ------
    ValueError
        If ``x`` is empty; there is nothing to resample.
    """
    # Accept seeds as well as ready-made generators.
    if not hasattr(rng, "choice"):
        rng = np.random.default_rng(rng)

    x = np.asarray(x)
    n = len(x)
    if n == 0:
        # Fail loudly instead of returning an array of NaNs.
        raise ValueError("cannot bootstrap an empty sample")

    stats = np.empty(B, dtype=float)
    for b in range(B):
        sample = rng.choice(x, size=n, replace=True)
        stats[b] = stat_fn(sample)
    return stats

def percentile_ci(samples, alpha=0.05):
    """Return the ``(lo, hi)`` percentile interval of ``samples``.

    ``lo`` is the ``alpha/2`` quantile and ``hi`` the ``1 - alpha/2``
    quantile, so ``alpha=0.05`` gives a 95% interval.

    Raises
    ------
    ValueError
        If ``alpha`` is outside [0, 1]; larger values would silently give an
        interval whose lower end exceeds its upper end.
    """
    if not 0 <= alpha <= 1:
        raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")
    lo = np.quantile(samples, alpha / 2)
    hi = np.quantile(samples, 1 - alpha / 2)
    return lo, hi
# %%
# Cell 11 — Run bootstrap for mean and median on the observed sample x
B = 5000
observed = x.values

# Order matters: both calls share `rng`, so the mean is bootstrapped first
# and the median second.
boot_mean = bootstrap_statistic(observed, np.mean, B=B, rng=rng)
boot_med = bootstrap_statistic(observed, np.median, B=B, rng=rng)

mean_ci = percentile_ci(boot_mean, alpha=0.05)
med_ci = percentile_ci(boot_med, alpha=0.05)

# One row per statistic: point estimate, bootstrap spread, 95% percentile CI.
bootstrap_rows = [
    ("mean", x.mean(), boot_mean, mean_ci),
    ("median", x.median(), boot_med, med_ci),
]
pd.DataFrame(
    [
        {
            "stat": stat_label,
            "point_estimate": estimate,
            "bootstrap_SD": np.std(replicates, ddof=1),
            "CI_95_lo": interval[0],
            "CI_95_hi": interval[1],
        }
        for stat_label, estimate, replicates, interval in bootstrap_rows
    ]
)
# %%
# Cell 12 — Plot bootstrap distributions
# Same histogram layout for both statistics; only the data and labels change.
for stat_label, replicates in (("mean", boot_mean), ("median", boot_med)):
    plt.figure()
    plt.hist(replicates, bins=40)
    plt.title(f"Bootstrap distribution: {stat_label}")
    plt.xlabel(stat_label)
    plt.ylabel("count")
    plt.show()
# %%
# Cell 13 — Remove the outlier and compare stability
x_no = x.iloc[:-1]  # everything except the last observation (the injected outlier)
trimmed = x_no.values

# Both calls share `rng`: mean first, then median.
boot_mean_no = bootstrap_statistic(trimmed, np.mean, B=B, rng=rng)
boot_med_no = bootstrap_statistic(trimmed, np.median, B=B, rng=rng)

# One row per scenario; the "with outlier" replicates come from Cell 11.
scenarios = [
    ("with outlier", x, boot_mean, boot_med),
    ("no outlier", x_no, boot_mean_no, boot_med_no),
]
comparison = pd.DataFrame(
    [
        {
            "case": label,
            "mean": series.mean(),
            "median": series.median(),
            "SD(boot mean)": np.std(mean_reps, ddof=1),
            "SD(boot median)": np.std(median_reps, ddof=1),
        }
        for label, series, mean_reps, median_reps in scenarios
    ]
)
comparison

# %% [markdown]
# # Student Task (deliverables)
# ### Submit a single notebook (.ipynb) with the following:
#
# ## Task A — Descriptive Stats (10 pts)
# 1. Compute and report: mean, median, SD, IQR, five-number summary. 
# 2. Plot: histogram + boxplot. 
# 3. Identify outliers using the 1.5×IQR rule and print them.
#
# ## Task B — Sampling Variability (10 pts)
# 1. Using the provided population experiment, run repeated sampling for **n = 10, 50, 200**. 
# 2. Plot the sampling distributions (3 histograms). 
# 3. Make a table of the empirical standard error (SD of sample means) vs n. 
# 4. Write 3–4 sentences: **Why does variability decrease when n increases?**
#
# ## Task C — Bootstrap Mean vs Median (10 pts)
# 1. Bootstrap the mean and median (B=5000) for the dataset with the outlier. 
# 2. Plot both bootstrap distributions. 
# 3. Compute 95% percentile CIs for mean and median. 
# 4. Repeat after removing the outlier and compare:
#    - Which statistic changes more (mean or median)?
#    - Which bootstrap distribution is wider, and why?
#
# ## Reflection (Bonus +2)
# In one paragraph: “Big n does not fix systematic bias.” Give one real-world example.

# %%
# Cell 14 — Student: write answers here (replace with your text)
answers = dict(
    TaskB_explanation="WRITE YOUR 3–4 SENTENCES HERE",
    TaskC_comparison="WRITE YOUR COMPARISON HERE",
    Bonus_reflection="OPTIONAL: WRITE YOUR PARAGRAPH HERE",
)
answers