|
|
# Data exploration and visualization
|
|
|
|
|
|
|
|
|
```python
|
|
|
!pip3 install scikit-learn
|
|
|
```
|
|
|
|
|
|
Requirement already satisfied: scikit-learn in /Users/gmarx/lwc/courses/aia/lab-sessions-25b/.venv/lib/python3.13/site-packages (1.7.2)
|
|
|
Requirement already satisfied: numpy>=1.22.0 in /Users/gmarx/lwc/courses/aia/lab-sessions-25b/.venv/lib/python3.13/site-packages (from scikit-learn) (2.3.2)
|
|
|
Requirement already satisfied: scipy>=1.8.0 in /Users/gmarx/lwc/courses/aia/lab-sessions-25b/.venv/lib/python3.13/site-packages (from scikit-learn) (1.16.2)
|
|
|
Requirement already satisfied: joblib>=1.2.0 in /Users/gmarx/lwc/courses/aia/lab-sessions-25b/.venv/lib/python3.13/site-packages (from scikit-learn) (1.5.2)
|
|
|
Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/gmarx/lwc/courses/aia/lab-sessions-25b/.venv/lib/python3.13/site-packages (from scikit-learn) (3.6.0)

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: pip install --upgrade pip
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
from sklearn import datasets
|
|
|
iris = datasets.load_iris()
|
|
|
print(iris.DESCR)
|
|
|
```
|
|
|
|
|
|
.. _iris_dataset:
|
|
|
|
|
|
Iris plants dataset
|
|
|
--------------------
|
|
|
|
|
|
**Data Set Characteristics:**
|
|
|
|
|
|
:Number of Instances: 150 (50 in each of three classes)
|
|
|
:Number of Attributes: 4 numeric, predictive attributes and the class
|
|
|
:Attribute Information:
|
|
|
- sepal length in cm
|
|
|
- sepal width in cm
|
|
|
- petal length in cm
|
|
|
- petal width in cm
|
|
|
- class:
|
|
|
- Iris-Setosa
|
|
|
- Iris-Versicolour
|
|
|
- Iris-Virginica
|
|
|
|
|
|
:Summary Statistics:
|
|
|
|
|
|
============== ==== ==== ======= ===== ====================
|
|
|
Min Max Mean SD Class Correlation
|
|
|
============== ==== ==== ======= ===== ====================
|
|
|
sepal length: 4.3 7.9 5.84 0.83 0.7826
|
|
|
sepal width: 2.0 4.4 3.05 0.43 -0.4194
|
|
|
petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
|
|
|
petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
|
|
|
============== ==== ==== ======= ===== ====================
|
|
|
|
|
|
:Missing Attribute Values: None
|
|
|
:Class Distribution: 33.3% for each of 3 classes.
|
|
|
:Creator: R.A. Fisher
|
|
|
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
|
|
|
:Date: July, 1988
|
|
|
|
|
|
The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
|
|
|
from Fisher's paper. Note that it's the same as in R, but not as in the UCI
|
|
|
Machine Learning Repository, which has two wrong data points.
|
|
|
|
|
|
This is perhaps the best known database to be found in the
|
|
|
pattern recognition literature. Fisher's paper is a classic in the field and
|
|
|
is referenced frequently to this day. (See Duda & Hart, for example.) The
|
|
|
data set contains 3 classes of 50 instances each, where each class refers to a
|
|
|
type of iris plant. One class is linearly separable from the other 2; the
|
|
|
latter are NOT linearly separable from each other.
|
|
|
|
|
|
.. dropdown:: References
|
|
|
|
|
|
- Fisher, R.A. "The use of multiple measurements in taxonomic problems"
|
|
|
Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
|
|
|
Mathematical Statistics" (John Wiley, NY, 1950).
|
|
|
- Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
|
|
|
(Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
|
|
|
- Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
|
|
|
Structure and Classification Rule for Recognition in Partially Exposed
|
|
|
Environments". IEEE Transactions on Pattern Analysis and Machine
|
|
|
Intelligence, Vol. PAMI-2, No. 1, 67-71.
|
|
|
- Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions
|
|
|
on Information Theory, May 1972, 431-433.
|
|
|
- See also: 1988 MLC Proceedings, 54-64. Cheeseman et al.'s AUTOCLASS II
|
|
|
conceptual clustering system finds 3 classes in the data.
|
|
|
- Many, many more ...
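
`load_iris` returns a `Bunch` whose fields, besides `DESCR`, include `data` (a 150×4 array) and `target` (150 integer labels), which the cells below index directly. For orientation, a minimal look at the standard fields:

```python
print(iris.feature_names)                  # ['sepal length (cm)', 'sepal width (cm)', ...]
print(iris.target_names)                   # ['setosa' 'versicolor' 'virginica']
print(iris.data.shape, iris.target.shape)  # (150, 4) (150,)
```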
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
import numpy as np
import matplotlib.pyplot as plt

# Sepal length and sepal width as (150, 1) column vectors
sl = iris.data[:, 0].reshape(-1, 1)
sw = iris.data[:, 1].reshape(-1, 1)

# Scatter plot: black circles, no class information yet
plt.plot(sl, sw, 'ok')
plt.show()

sl.shape
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(150, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
tg = iris.target   # integer class labels: 0 = setosa, 1 = versicolor, 2 = virginica

# Same scatter, now colored by class
plt.plot(sl[tg==0, 0], sw[tg==0, 0], 'og', label="Seto")
plt.plot(sl[tg==1, 0], sw[tg==1, 0], 'or', label="Versi")
plt.plot(sl[tg==2, 0], sw[tg==2, 0], 'ob', label="Virgi")
plt.legend()
plt.show()
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
# Binary classifier with one parameter
|
|
|
|
|
|
|
|
|
```python
|
|
|
z = np.linspace(-10, 10, 100)

# Sum of two horizontally shifted logistic curves: sigmoid(z + 4) + sigmoid(z - 4).
# Shifting the argument moves the transition left/right; the sum ranges from 0 to 2.
sig = 1/(1 + np.exp(-z - 4)) + 1/(1 + np.exp(-z + 4))

plt.plot(z, sig, 'ob')
plt.show()
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
# First classifier
|
|
|
$$z = \theta_1\times x_1 + \theta_0$$
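
Passing $z$ through the sigmoid turns the score into a probability, and training minimizes the log loss implemented below:

$$\hat{y} = \sigma(z) = \frac{1}{1+e^{-z}}, \qquad J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\hat{y}^{(i)} + \left(1-y^{(i)}\right)\log\left(1-\hat{y}^{(i)}\right)\right]$$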
|
|
|
|
|
|
|
|
|
```python
|
|
|
pw = iris.data[:, 3].reshape(-1, 1)                 # petal width (cm): the single feature x1
X = np.c_[np.ones_like(pw), pw]                     # prepend a ones column for the bias theta_0
y = (iris.target == 0).astype(int).reshape(-1, 1)   # binary target: 1 = Setosa, 0 = other
|
|
|
```
|
|
|
|
|
|
|
|
|
```python
|
|
|
def sigmoid(z):
    # Clip the argument so np.exp cannot overflow for large |z|
    z = np.clip(z, -50, 50)
    sig = 1/(1 + np.exp(-z))
    return sig
|
|
|
```
|
|
|
|
|
|
|
|
|
```python
|
|
|
def logLoss(y, yModel):
    # Clip probabilities away from exactly 0 and 1 so np.log stays finite
    yModel = np.clip(yModel, 1e-12, 1 - 1e-12)
    loss = -np.mean(y*np.log(yModel) + (1 - y)*np.log(1 - yModel))
    return loss
|
|
|
```
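
The training loop below uses the standard gradient of this loss, which in matrix form is

$$\nabla_\theta J = \frac{1}{m}\,X^\top\big(\sigma(X\theta) - y\big)$$

and each step updates $\theta \leftarrow \theta - \alpha\,\nabla_\theta J$ with learning rate $\alpha$ (`lr` below).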
|
|
|
|
|
|
|
|
|
```python
|
|
|
# Gradient descent hyperparameters
lr = 0.1             # learning rate (alpha)
epochs = 5000
m = X.shape[0]       # number of training samples

# Reproducible random initialization of theta = [theta_0, theta_1]
np.random.seed(10)
theta = np.random.rand(2, 1)
theta
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
array([[0.77132064],
|
|
|
[0.02075195]])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
losses = []

for i in range(epochs):
    z = X @ theta                  # linear scores for all samples
    h = sigmoid(z)                 # predicted probabilities
    grad = (X.T @ (h - y)) / m     # gradient of the log loss
    theta = theta - lr * grad      # gradient-descent step
    lossValue = logLoss(y, h)
    losses.append(lossValue)
    if i % 100 == 0:
        print(f"Epoch {i:4d}, Loss: {lossValue:.6f}")

theta
|
|
|
```
|
|
|
|
|
|
Epoch 0, Loss: 0.909705
|
|
|
Epoch 100, Loss: 0.262854
|
|
|
Epoch 200, Loss: 0.194549
|
|
|
Epoch 300, Loss: 0.154778
|
|
|
Epoch 400, Loss: 0.128995
|
|
|
Epoch 500, Loss: 0.110967
|
|
|
Epoch 600, Loss: 0.097650
|
|
|
Epoch 700, Loss: 0.087403
|
|
|
Epoch 800, Loss: 0.079264
|
|
|
Epoch 900, Loss: 0.072636
|
|
|
Epoch 1000, Loss: 0.067129
|
|
|
Epoch 1100, Loss: 0.062475
|
|
|
Epoch 1200, Loss: 0.058488
|
|
|
Epoch 1300, Loss: 0.055030
|
|
|
Epoch 1400, Loss: 0.052002
|
|
|
Epoch 1500, Loss: 0.049325
|
|
|
Epoch 1600, Loss: 0.046941
|
|
|
Epoch 1700, Loss: 0.044803
|
|
|
Epoch 1800, Loss: 0.042874
|
|
|
Epoch 1900, Loss: 0.041124
|
|
|
Epoch 2000, Loss: 0.039528
|
|
|
Epoch 2100, Loss: 0.038066
|
|
|
Epoch 2200, Loss: 0.036723
|
|
|
Epoch 2300, Loss: 0.035482
|
|
|
Epoch 2400, Loss: 0.034334
|
|
|
Epoch 2500, Loss: 0.033267
|
|
|
Epoch 2600, Loss: 0.032273
|
|
|
Epoch 2700, Loss: 0.031345
|
|
|
Epoch 2800, Loss: 0.030475
|
|
|
Epoch 2900, Loss: 0.029660
|
|
|
Epoch 3000, Loss: 0.028892
|
|
|
Epoch 3100, Loss: 0.028169
|
|
|
Epoch 3200, Loss: 0.027486
|
|
|
Epoch 3300, Loss: 0.026840
|
|
|
Epoch 3400, Loss: 0.026227
|
|
|
Epoch 3500, Loss: 0.025647
|
|
|
Epoch 3600, Loss: 0.025094
|
|
|
Epoch 3700, Loss: 0.024569
|
|
|
Epoch 3800, Loss: 0.024068
|
|
|
Epoch 3900, Loss: 0.023591
|
|
|
Epoch 4000, Loss: 0.023134
|
|
|
Epoch 4100, Loss: 0.022698
|
|
|
Epoch 4200, Loss: 0.022280
|
|
|
Epoch 4300, Loss: 0.021879
|
|
|
Epoch 4400, Loss: 0.021495
|
|
|
Epoch 4500, Loss: 0.021126
|
|
|
Epoch 4600, Loss: 0.020771
|
|
|
Epoch 4700, Loss: 0.020430
|
|
|
Epoch 4800, Loss: 0.020102
|
|
|
Epoch 4900, Loss: 0.019785
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
array([[ 5.73789762],
|
|
|
[-7.93887721]])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
plt.plot(losses)   # training loss per epoch
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
[<matplotlib.lines.Line2D at 0x11cfb3750>]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
# Dense grid over petal width for plotting the fitted curve
xNew = np.linspace(-0.5, 3, m)
Xnew = np.c_[np.ones_like(xNew), xNew]
yMod = sigmoid(Xnew @ theta)

# Vertical jitter so the overlapping 0/1 labels stay visible
yJitter = y + np.random.uniform(-0.1, 0.1, size=y.shape)

# Final training loss with the learned theta
logloss = logLoss(y, sigmoid(X @ theta))
print(logloss)
|
|
|
```
|
|
|
|
|
|
0.019479899336526857
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
plt.plot(pw, yJitter, 'og', alpha=0.3)   # jittered Setosa labels vs. petal width
plt.plot(xNew, yMod, ':r')               # fitted logistic curve
plt.show()
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
p_train = sigmoid(X @ theta)
|
|
|
y_hat = (p_train >= 0.5).astype(int) # 0.5 is default; tune if needed
|
|
|
acc = (y_hat == y).mean()
|
|
|
print(f"Train accuracy: {acc:.3f}")
|
|
|
```
|
|
|
|
|
|
Train accuracy: 1.000
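
Perfect training accuracy is expected here: as the DESCR notes, Setosa is linearly separable from the other two classes, and petal width alone separates it cleanly. As a sanity check (a sketch, not part of the lab), scikit-learn's `LogisticRegression` should reach the same accuracy; note that it applies L2 regularization by default, so its coefficients need not match `theta` exactly:

```python
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()        # default solver and L2 penalty
clf.fit(pw, y.ravel())            # pw: petal width, y: Setosa indicator
print(clf.intercept_, clf.coef_)  # roughly comparable to theta above
print("sklearn accuracy:", clf.score(pw, y.ravel()))
```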
|
|
|
|
|
|
|
|
|
|
|
|
|