You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

187 KiB

Raw Permalink Blame History

None <html lang="en"> <head> </head>

In [1]:

# General used libriries
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom, poisson, norm, expon

Bernoulli distribution¶

In [2]:

# Bernoulli distribution:
p = 0.7
print("P(X=1) = ", p)
print("P(X=0) = ", 1-p)
print("Mean = ", p)
print("Variance = ", p*(1-p))

P(X=1) =  0.7
P(X=0) =  0.30000000000000004
Mean =  0.7
Variance =  0.21000000000000002

In [3]:

# Bernoulli observations 
# repeating a success|failures experiment many times
p = 0.7
n = 1000 #samples
np.random.seed(33)
berData = np.random.binomial(1, p, size=n)
print("First 20 observations:")
print(berData[:20])

print("Sample mean: ", berData.mean())
print("Sample variance: ", berData.var(ddof=1))

First 20 observations:
[1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 0 1 1 1]
Sample mean:  0.675
Sample variance:  0.21959459459459454

In [4]:

values, counts = np.unique(berData, return_counts=True)
plt.bar(values, counts/n, width=0.4, color="grey")
plt.xticks([0, 1])
plt.xlabel("Outcomes")
plt.ylabel("Relative frequency")
plt.title("Simulated Bernoulli Distribution")
plt.show()

No description has been provided for this image

Binomial distribution¶

The binomial distribution counts the number of successes in $n$ independent Bernoulli trials.

In [5]:

p = 0.08
n = 20
k = 2

prob = binom.pmf(k, n, p)
print(f"P(X=2) = {prob:.4f}")
print("Mean= ", n*p)
print("Variance= ", n*p*(1-p))

P(X=2) = 0.2711
Mean=  1.6
Variance=  1.4720000000000002

In [6]:

x = np.arange(0, n+1)
pmf = binom.pmf(x, n, p)
plt.bar(x, pmf, color="grey", width=0.6)
plt.xlabel("Number of succeses")
plt.ylabel("Probability")
plt.title("Binomial distribution: n=20, p=0.08")
plt.show()

In [7]:

# Binomial simuation
n = 20 #sample
p = 0.08
nPop = 10_000
np.random.seed(33)
binomData = np.random.binomial(n,p,size=nPop)

print("Simulated mean: ", binomData.mean())
print("Theoretical mean: ", n*p)
print("--------------")
print("Simulated var: ", binomData.var(ddof=1))
print("Theoretical var: ", n*p*(1-p))

Simulated mean:  1.6011
Theoretical mean:  1.6
--------------
Simulated var:  1.4717259625962598
Theoretical var:  1.4720000000000002

In [8]:

plt.hist(binomData, bins=np.arange(-0.5,n+1.5,1), density=True, color="grey")
plt.plot(x, pmf, 'ok')
plt.show()

In [9]:

print(pmf)
20*0.08*(1-0.08)**(19)

[1.88693329e-01 3.28162312e-01 2.71090605e-01 1.41438577e-01
 5.22707783e-02 1.45449122e-02 3.16193744e-03 5.49902164e-04
 7.77035666e-05 9.00910917e-06 8.61740877e-07 6.81218085e-08
 4.44272664e-09 2.37737880e-10 1.03364296e-11 3.59527985e-13
 9.76978221e-15 1.99893242e-16 2.89700351e-18 2.65171946e-20
 1.15292150e-22]

Out[9]:

0.32816231158747255

Poisson distribution¶

$P(X=k) = e^{-\lambda} \lambda^k/k!$
$E[X] = \lambda$
$Var(X) = \lambda$

Example: Let $X$ be the number of flaws per metre per cable, with average rate $\lambda=3$.

In [14]:

lam = 3
print("P(X=0) = ", poisson.pmf(0, lam))
print("P(X=2) = ", poisson.pmf(2, lam))
print("Mean = ", lam)
print("Var = ", lam)

P(X=0) =  0.049787068367863944
P(X=2) =  0.22404180765538775
Mean =  3
Var =  3

In [22]:

x = np.arange(0, 15)
pmfPoisson = poisson.pmf(x, lam)

plt.bar(x, pmfPoisson, width=0.7)
plt.xlabel("Count")
plt.ylabel("Probability")
plt.title(r"Poisson distribution: $\lambda=3$")
plt.show()

In [33]:

# Poisson distribution
lam = 3
samples = 10_000
poiData = np.random.poisson(lam, size=samples)

print("Simulated mean: ", poiData.mean())
print("Theoretical mean: ", lam)

print("Simulated Var: ", poiData.var(ddof=1))
print("Theoretical Var: ", lam)

Simulated mean:  2.9953
Theoretical mean:  3
Simulated Var:  2.9753754475447542
Theoretical Var:  3

In [37]:

plt.hist(poiData, bins=np.arange(-0.5, 15.5,1), density=True)
plt.plot(x, poisson.pmf(x, lam),'or')
plt.xlabel("Counts")
plt.ylabel("Relative Freq/Probability")
plt.title("Poisson: Simulated vs Theory")
plt.show()

Normal distribution¶

If $X\sim N(\mu, \sigma^2)$, then:

$\mu$ is the mean,
$\sigma$ is the standard deviation

Example: Suppose a sensor output is a normally distributed with mean 50 and standard deviation 4. We compute the probability that the output is less than 58.

In [45]:

mu = 50 
sigma = 4
proba = norm.cdf(58, loc=mu, scale=sigma)
proba

Out[45]:

np.float64(0.9772498680518208)

In [47]:

x = np.linspace(mu - 4*sigma, mu + 4*sigma, 400)
pdf = norm.pdf(x, loc=mu, scale=sigma)

plt.figure(figsize=(8,4))
plt.plot(x, pdf)
plt.axvline(58, linestyle='--')
plt.xlabel("x")
plt.ylabel("Density")
plt.title("Normal Distribution")
plt.show()

In [51]:

mu = 50
sigma = 4 
samples = 10_000

normData = np.random.normal(loc=mu, scale=sigma, size=samples)
print("Simulated mean: ", normData.mean())
print("Theoretical mean: ", mu)
print("Simulated std: ", normData.std(ddof=1))
print("Theoretical std: ", sigma)

Simulated mean:  50.004935022325526
Theoretical mean:  50
Simulated std:  3.9780626491745017
Theoretical std:  4

In [58]:

plt.hist(normData, bins=50, density=True)
plt.plot(x, pdf, 'or')
plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Normal: Simulation vs Theory")
plt.show()

Central Limit Theorem¶

The central limit theorem (CLT) states that the sampling distribution of the sample mean becomes approximately normal as the sample size increases.

In [61]:

population = np.random.exponential(scale=2, size=100_000)
plt.hist(population, bins=50, density=True)
plt.xlabel("Value")
plt.ylabel("Density")
plt.show()

In [ ]:

In [62]:

def SampleMeans(population, sampleSize, nRep=5_000):
    means = []
    for _ in range(nRep):
        sample = np.random.choice(population, size=sampleSize, replace=True)
        means.append(sample.mean())
        pass
    return np.array(means)

In [63]:

mean5 = SampleMeans(population, 5)
mean30 = SampleMeans(population, 30)
mean100 = SampleMeans(population, 100)

In [68]:

plt.hist(mean5, bins=40, density=True, color="orange",  alpha=0.3)
plt.hist(mean30, bins=40, density=True, color="green",  alpha=0.3)
plt.hist(mean100, bins=40, density=True, color="blue",  alpha=0.3)
plt.show()

In [72]:

# Numerical Check
print("Population mean: ", population.mean())
print("Population var: ", population.var())

print("Mean of 5 samples: ", mean5.mean())
print("Var of 5 samples: ", mean5.var(ddof=1))
print("Mean of 30 samples: ", mean30.mean())
print("Var of 5 samples: ", mean30.var(ddof=1))
print("Mean of 100 samples: ", mean100.mean())

Population mean:  1.9932073011758906
Population var:  4.025192782387372
Mean of 5 samples:  1.9998045611349438
Var of 5 samples:  0.7837744844351563
Mean of 30 samples:  1.992377755782367
Var of 5 samples:  0.13206867208626297
Mean of 100 samples:  1.994853961830046

</html>

187 KiB Raw Permalink Blame History