You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
187 KiB
187 KiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
# General used libriries
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom, poisson, norm, expon
Bernoulli distribution¶
In [2]:
# Bernoulli distribution:
p = 0.7
print("P(X=1) = ", p)
print("P(X=0) = ", 1-p)
print("Mean = ", p)
print("Variance = ", p*(1-p))
In [3]:
# Bernoulli observations
# repeating a success|failures experiment many times
p = 0.7
n = 1000 #samples
np.random.seed(33)
berData = np.random.binomial(1, p, size=n)
print("First 20 observations:")
print(berData[:20])
print("Sample mean: ", berData.mean())
print("Sample variance: ", berData.var(ddof=1))
In [4]:
values, counts = np.unique(berData, return_counts=True)
plt.bar(values, counts/n, width=0.4, color="grey")
plt.xticks([0, 1])
plt.xlabel("Outcomes")
plt.ylabel("Relative frequency")
plt.title("Simulated Bernoulli Distribution")
plt.show()
Binomial distribution¶
The binomial distribution counts the number of successes in $n$ independent Bernoulli trials.
In [5]:
p = 0.08
n = 20
k = 2
prob = binom.pmf(k, n, p)
print(f"P(X=2) = {prob:.4f}")
print("Mean= ", n*p)
print("Variance= ", n*p*(1-p))
In [6]:
x = np.arange(0, n+1)
pmf = binom.pmf(x, n, p)
plt.bar(x, pmf, color="grey", width=0.6)
plt.xlabel("Number of succeses")
plt.ylabel("Probability")
plt.title("Binomial distribution: n=20, p=0.08")
plt.show()
In [7]:
# Binomial simuation
n = 20 #sample
p = 0.08
nPop = 10_000
np.random.seed(33)
binomData = np.random.binomial(n,p,size=nPop)
print("Simulated mean: ", binomData.mean())
print("Theoretical mean: ", n*p)
print("--------------")
print("Simulated var: ", binomData.var(ddof=1))
print("Theoretical var: ", n*p*(1-p))
In [8]:
plt.hist(binomData, bins=np.arange(-0.5,n+1.5,1), density=True, color="grey")
plt.plot(x, pmf, 'ok')
plt.show()
In [9]:
print(pmf)
20*0.08*(1-0.08)**(19)
Out[9]:
Poisson distribution¶
- $P(X=k) = e^{-\lambda} \lambda^k/k!$
- $E[X] = \lambda$
- $Var(X) = \lambda$
Example: Let $X$ be the number of flaws per metre per cable, with average rate $\lambda=3$.
In [14]:
lam = 3
print("P(X=0) = ", poisson.pmf(0, lam))
print("P(X=2) = ", poisson.pmf(2, lam))
print("Mean = ", lam)
print("Var = ", lam)
In [22]:
x = np.arange(0, 15)
pmfPoisson = poisson.pmf(x, lam)
plt.bar(x, pmfPoisson, width=0.7)
plt.xlabel("Count")
plt.ylabel("Probability")
plt.title(r"Poisson distribution: $\lambda=3$")
plt.show()
In [33]:
# Poisson distribution
lam = 3
samples = 10_000
poiData = np.random.poisson(lam, size=samples)
print("Simulated mean: ", poiData.mean())
print("Theoretical mean: ", lam)
print("Simulated Var: ", poiData.var(ddof=1))
print("Theoretical Var: ", lam)
In [37]:
plt.hist(poiData, bins=np.arange(-0.5, 15.5,1), density=True)
plt.plot(x, poisson.pmf(x, lam),'or')
plt.xlabel("Counts")
plt.ylabel("Relative Freq/Probability")
plt.title("Poisson: Simulated vs Theory")
plt.show()
Normal distribution¶
If $X\sim N(\mu, \sigma^2)$, then:
- $\mu$ is the mean,
- $\sigma$ is the standard deviation
Example: Suppose a sensor output is a normally distributed with mean 50 and standard deviation 4. We compute the probability that the output is less than 58.
In [45]:
mu = 50
sigma = 4
proba = norm.cdf(58, loc=mu, scale=sigma)
proba
Out[45]:
In [47]:
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 400)
pdf = norm.pdf(x, loc=mu, scale=sigma)
plt.figure(figsize=(8,4))
plt.plot(x, pdf)
plt.axvline(58, linestyle='--')
plt.xlabel("x")
plt.ylabel("Density")
plt.title("Normal Distribution")
plt.show()
In [51]:
mu = 50
sigma = 4
samples = 10_000
normData = np.random.normal(loc=mu, scale=sigma, size=samples)
print("Simulated mean: ", normData.mean())
print("Theoretical mean: ", mu)
print("Simulated std: ", normData.std(ddof=1))
print("Theoretical std: ", sigma)
In [58]:
plt.hist(normData, bins=50, density=True)
plt.plot(x, pdf, 'or')
plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Normal: Simulation vs Theory")
plt.show()
Central Limit Theorem¶
The central limit theorem (CLT) states that the sampling distribution of the sample mean becomes approximately normal as the sample size increases.
In [61]:
population = np.random.exponential(scale=2, size=100_000)
plt.hist(population, bins=50, density=True)
plt.xlabel("Value")
plt.ylabel("Density")
plt.show()
In [ ]:
In [62]:
def SampleMeans(population, sampleSize, nRep=5_000):
means = []
for _ in range(nRep):
sample = np.random.choice(population, size=sampleSize, replace=True)
means.append(sample.mean())
pass
return np.array(means)
In [63]:
mean5 = SampleMeans(population, 5)
mean30 = SampleMeans(population, 30)
mean100 = SampleMeans(population, 100)
In [68]:
plt.hist(mean5, bins=40, density=True, color="orange", alpha=0.3)
plt.hist(mean30, bins=40, density=True, color="green", alpha=0.3)
plt.hist(mean100, bins=40, density=True, color="blue", alpha=0.3)
plt.show()
In [72]:
# Numerical Check
print("Population mean: ", population.mean())
print("Population var: ", population.var())
print("Mean of 5 samples: ", mean5.mean())
print("Var of 5 samples: ", mean5.var(ddof=1))
print("Mean of 30 samples: ", mean30.mean())
print("Var of 5 samples: ", mean30.var(ddof=1))
print("Mean of 100 samples: ", mean100.mean())