3. UNIT 2. Statistical Learning#

This unit provides an introduction to statistical learning and is strongly based on [1].

Table of contents

  • UNIT 2. Statistical Learning

    • Evaluation of the loss function in K-means

    • Polynomial regression

      • Training data with fitted curves for different \(p\)

      • Test loss as a function of the number of parameters \(p\) of the model

      • Cross-validation

3.1. Evaluation of the loss function in K-means#

K-means is an example of an unsupervised learning method. We will use it, together with scikit-learn and NumPy, to evaluate the loss function for a given dataset.

We will first import some data using numpy.

import numpy as np
Xmat = np.genfromtxt('datasets/clusterdata.csv', delimiter=',')
n, D = Xmat.shape
n,D
(300, 2)

The first thing to do is always to visualize the data:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(Xmat[:,0],Xmat[:,1])
ax.grid(True)
plt.savefig("../figures/data.png")
plt.show()
[Figure: scatter plot of the raw data]

First, we will run K-means using scikit-learn, a high-level machine learning library for Python.

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3,random_state=0,n_init="auto").fit(Xmat)
kmeans.labels_
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0,
       2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2], dtype=int32)
centers=kmeans.cluster_centers_
centers
array([[ 0.6177812 , -1.29975072],
       [-3.86806187,  0.04564101],
       [-1.84791126, -3.02916471]])
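
The loss that K-means minimizes is the sum of squared distances from each point to its nearest centroid; scikit-learn exposes it as the inertia_ attribute of the fitted model, which we can compare with the manual computation further below.

# K-means loss reported by scikit-learn
print('Total within-cluster sum of squares:', kmeans.inertia_)
print('Average loss per point:', kmeans.inertia_ / n)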

Now we plot the clusters again, coloring each point by its label.

labels=kmeans.labels_

cluster_name = ["Cluster"+str(i) for i in set(labels)]

fig, ax = plt.subplots()
ax.scatter(Xmat[:,0],Xmat[:,1],c=labels)
for i, txt in enumerate(cluster_name):  # annotate each cluster with its name at the centroid
    ax.text(centers[i,0],centers[i,1],s=txt,ha="center",va="center")
ax.grid(True)
plt.show()
[Figure: scatter plot colored by K-means cluster label, with cluster names at the centroids]

With seaborn we can also obtain a nice plot in a faster way:

import seaborn as sns
sns.scatterplot(x=Xmat[:,0],y=Xmat[:,1],hue=labels,legend='full',palette="Set1")

We will now run the algorithm in a more manual and detailed way, without the use of scikit-learn. We start by initializing the centers.

K = 3
c = np.array([[-2.0, -4, 0], [-3, 1, -1]])   # initialize centers (one column per centroid)
cold = np.zeros(c.shape)
dist2 = np.zeros((K, n))

while np.abs(c - cold).sum() > 0.001:        # iterate until the centers stop moving
    print(c)
    cold = c.copy()
    for i in range(0, K):                    # compute the squared distances to each center
        dist2[i, :] = np.sum((Xmat - c[:, i].T)**2, 1)

    label = np.argmin(dist2, 0)              # assign each point to its nearest centroid
    minvals = np.amin(dist2, 0)
    for i in range(0, K):                    # recompute the centroids
        c[:, i] = np.mean(Xmat[np.where(label == i), :], 1).reshape(1, 2)

print('Loss = {:3.3f}'.format(minvals.mean()))
print(minvals)
[[-2. -4.  0.]
 [-3.  1. -1.]]
[[-1.97122735 -3.99500505  0.48593068]
 [-3.01536786  0.0589819  -1.24373331]]
[[-1.92700937 -3.94414519  0.56115053]
 [-3.01308366  0.02001296 -1.29804366]]
[[-1.9285638  -3.92373524  0.56115053]
 [-3.04155556  0.01309036 -1.29804366]]
Loss = 2.288
[1.10370659e+00 3.16794127e+00 9.26321472e-01 6.36198076e+00
 8.19130169e-02 7.65221873e-01 1.90343757e+00 3.87433330e-01
 3.42126461e+00 3.95242456e-02 1.28746860e+00 2.61470537e+00
 6.60069131e-01 9.83476613e-01 5.00950821e+00 9.27041473e+00
 9.08533546e-02 6.56955782e-01 8.42663900e-01 5.42274818e+00
 3.28259714e-01 1.39524393e+00 9.30193801e+00 4.87563968e-01
 1.39388416e-01 2.88914157e+00 1.68884217e-01 2.76846935e-01
 3.60211197e-01 1.99402612e+00 6.37654752e-01 5.54788080e-01
 7.14620966e-01 3.98814235e+00 2.64597189e+00 4.12086589e+00
 4.06560394e+00 6.19786157e+00 1.95279352e-01 5.87981054e+00
 7.81544253e-01 1.89578819e-02 1.58114767e+01 1.59106964e+00
 1.24970759e+00 1.54724188e+00 5.80169437e-01 2.79809498e+00
 2.58827718e+00 1.56788245e-01 6.29450307e+00 7.55126173e-01
 2.11125454e+00 3.16687274e+00 3.17499809e+00 2.29500150e+00
 7.19450650e-01 1.70127421e-01 2.04775355e+00 1.18756618e+00
 5.94045269e+00 2.65376419e+00 1.33189853e+00 2.61497243e+00
 2.50971365e+00 1.15383239e+01 1.23152835e+00 2.27242889e-01
 1.51577676e+00 2.44436812e+00 2.52548096e-01 1.00716864e+00
 1.54302396e+00 4.87459232e+00 4.82658821e+00 3.33043111e-01
 2.20382717e+00 6.63840451e-01 3.62129345e-01 1.75664599e+00
 9.94602417e-01 2.62081394e-01 6.52313073e-01 1.08904875e+00
 3.20499488e+00 1.16366830e+01 8.61817141e+00 2.46213616e-01
 4.80191816e-01 1.97584570e+00 7.95787481e-01 9.66485293e+00
 6.23044138e+00 1.21366725e+00 5.31963203e-01 7.20398700e+00
 8.03798887e+00 3.08059107e+00 1.13250857e-01 4.26708629e+00
 1.26435074e+00 3.91000069e+00 3.34324056e+00 5.62382386e-01
 1.60412838e+00 1.39746630e-01 2.91118234e+00 5.60292242e+00
 2.30189623e+00 3.81623770e+00 6.09244632e+00 1.66161109e-01
 1.58651762e+00 5.20660068e-01 1.28762722e+00 3.96034709e+00
 5.44074737e+00 2.82918501e-01 4.94435099e+00 1.18292865e-01
 2.03787792e+00 1.16602956e+00 2.65935575e+00 7.76241771e-01
 5.08711032e-01 6.55672696e+00 1.62636781e+01 4.49737124e-01
 6.08708861e+00 2.15050345e+00 3.86225757e+00 2.25852470e+00
 7.57486935e-01 2.64345611e+00 3.47281604e-01 1.73673166e-01
 2.56056275e+00 1.09820137e+01 6.87173401e+00 3.24082157e+00
 7.27456514e+00 4.61362079e+00 2.53893181e+00 1.44035262e+00
 4.29147431e+00 5.21623035e-01 4.11675258e+00 4.99996659e+00
 5.02390888e+00 4.13191182e+00 7.32940303e-01 1.23456152e+00
 2.30590112e-01 2.40346630e-01 1.71944739e+00 1.42420158e-01
 3.87376424e-01 2.64658243e+00 9.49783995e-01 5.79860454e-01
 1.42771516e+01 5.15342382e-01 3.11544396e+00 4.71103811e+00
 1.68527721e+00 1.01124027e+00 2.71735334e+00 5.01697014e+00
 1.20401047e+00 1.22828133e+00 3.79904667e+00 3.10943027e+00
 4.60659922e+00 2.18761033e+00 1.46187638e+00 5.74747581e+00
 4.62576799e-01 5.51874621e+00 2.13933425e+00 1.19274129e+00
 1.36760292e+00 4.43987948e+00 2.72913321e-02 1.18239043e+00
 2.59255662e+00 1.67005336e+00 6.15207790e-01 4.22885589e+00
 1.09236120e+00 2.62886596e+00 1.96367462e-01 1.66335775e+00
 1.05018702e+00 2.30007270e+00 4.71160544e-01 6.90490130e+00
 3.06360029e-02 4.17291823e+00 1.78829118e+00 1.66644606e+00
 3.32877806e-01 1.83417521e+00 2.39280214e+00 3.54104248e-02
 9.71440745e-01 8.57466028e-02 3.01524403e-01 2.51650185e+00
 9.00230729e-01 2.04091281e+00 1.43003044e+00 1.05355761e-01
 8.33502890e-01 6.75307367e-03 9.96668028e-01 1.54330680e+00
 6.42110498e-02 3.03354967e+00 5.18414108e-01 2.94461794e+00
 8.83137151e-01 6.18140374e+00 3.97633793e-02 8.59633916e-01
 7.29692942e-01 2.90559003e+00 3.93359104e-02 2.59904337e+00
 3.97286989e-01 3.02875510e+00 2.21762720e+00 1.48792678e+00
 1.02653170e+00 1.30353944e+00 2.96678597e+00 2.08537385e-01
 2.20328300e+00 7.82104486e-01 1.60013270e-02 3.90905913e+00
 6.53113989e-01 5.61312184e-01 3.13260955e+00 2.47656289e+00
 2.52067057e-01 3.60714597e+00 1.59703074e-01 9.01337381e-01
 3.91650593e-01 1.36566149e+00 7.61761529e-01 8.63626265e-01
 3.41509664e+00 1.64479911e-01 4.42063882e-01 4.43310734e+00
 3.11082936e-01 1.39950091e+00 7.18360934e-01 3.25502334e+00
 7.57171073e+00 1.64074023e-01 2.58883476e-01 5.22163437e-01
 3.37382868e-01 3.55945840e+00 2.68774527e+00 7.96441535e-01
 2.04766652e+00 4.56350731e-01 2.04906580e+00 3.47565354e+00
 3.18512410e-01 1.42210012e-01 7.66775883e+00 3.30355289e-01
 3.95448763e-01 2.91801964e+00 1.56676421e-01 7.66083351e-01
 3.04779263e+00 2.05917343e+00 2.73682145e+00 2.18834246e+00
 1.98642053e+00 1.25553813e+00 5.73958598e-01 4.34455361e-01
 8.30947169e-01 9.72857253e-03 1.96138433e+00 6.22493856e-02
 5.42555071e-01 5.55735339e+00 1.55951686e+00 1.87152470e-01
 6.70689369e-02 2.77515675e-01 1.50853483e+00 6.17071685e-01]

3.2. Polynomial regression#

Let us generate iid data \((u_i, y_i)\), where the \(u_i\) are drawn uniformly on \([0, 1]\) and, given \(u_i\), the response \(y_i\) has a normal distribution with expectation \(10 - 140u_i + 400u_i^2 - 250u_i^3\) and variance 25. This is an example of a polynomial regression model. Using a squared-error loss, the optimal prediction function \(h^*(u) = E[Y \mid U = u]\) is thus

\[h^*(u) = 10 - 140u + 400u^2 - 250u^3\]
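
To see why the conditional expectation is optimal under squared-error loss, note that for any prediction function \(h\),

\[E\,(Y - h(U))^2 = E\,(Y - h^*(U))^2 + E\,\big(h^*(U) - h(U)\big)^2,\]

since the cross term vanishes by the tower property; the right-hand side is therefore minimized by choosing \(h = h^*\).
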
import numpy as np
import matplotlib.pyplot as plt

def generate_data(beta, sig, n):  # generate data with the requested variance sig**2
   u = np.random.rand(n, 1)
   y = (u ** np.arange(0, 4)) @ beta + sig * np.random.randn(n, 1)  # we use @ for matrix multiplication
   return u, y

np.random.seed(12)
beta = np.array([[10, -140, 400, -250]]).T
n = 100
sig = 5
u, y = generate_data(beta , sig, n)

# define the smooth line
xx = np.arange(np.min(u), np.max(u)+5e-3, 5e-3)
yy = np.polyval(np.flip(beta), xx)     # evaluate the polynomial

plt.plot(u, y, '.', markersize=8)
plt.plot(xx, yy, '--',linewidth=3)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
plt.xlabel(r'$u$')
plt.ylabel(r'$h^*(u)$')
plt.legend(['data points','true'])
plt.savefig('../figures/polydatpy.pdf',format='pdf')
plt.show()
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

3.2.1. Training data with fitted curves for different \(p\)#

We will check the models for \(p = 2, 4, 16\). The true cubic polynomial curve (corresponding to \(p = 4\)) is also plotted as a dashed line.

from numpy.linalg import norm , solve

max_p = 18
p_range = np.arange(1, max_p + 1, 1)
X = np.ones((n, 1))
betahat, trainloss = {}, {}

for p in p_range:  # p is the number of parameters
    if p > 1:
        X = np.hstack((X, u**(p-1)))  # add column to matrix

    betahat[p] = solve(X.T @ X, X.T @ y)
    trainloss[p] = (norm(y - X @ betahat[p])**2/n)

p = [2, 4, 16]  # select three curves

#replot the points and true line and store in the list "plots"
plots = [plt.plot(u, y, 'k.', markersize=8)[0], 
         plt.plot(xx, yy, 'k--',linewidth=3)[0]]
# add the three curves
for i in p:
    yy = np.polyval(np.flip(betahat[i]), xx)
    plots.append(plt.plot(xx, yy)[0])

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
plt.xlabel(r'$u$')
plt.ylabel(r'$h^{\mathcal{H}_p}_{\tau}(u)$')
plt.legend(plots,('data points', 'true','$p=2$, underfit',
                  '$p=4$, correct','$p=16$, overfit','d'))
plt.savefig('../figures/polyfitpy.pdf',format='pdf')
plt.show()
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Figure: training data, true cubic curve, and fitted curves for \(p = 2\) (underfit), \(p = 4\) (correct), and \(p = 16\) (overfit)]
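
As a side note, solving the normal equations \(X^T X \beta = X^T y\) directly can be numerically delicate when \(p\) is large; np.linalg.lstsq, which minimizes \(\|y - X\beta\|^2\) with an SVD-based routine, is a more stable alternative. A minimal sketch:

# Alternative least-squares fit without forming X.T @ X explicitly;
# np.linalg.lstsq returns (solution, residuals, rank, singular values).
betahat_ls, *_ = np.linalg.lstsq(X, y, rcond=None)
# compare with the normal-equations solution for the largest model
print(np.column_stack((betahat_ls, betahat[max_p])))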

3.2.2. Test loss as a function of the number of parameters \(p\) of the model#

We now generate an independent test set from the same model and compute the test loss for each \(p\), using the coefficients betahat[p] fitted on the training data.

# generate test data
u_test, y_test = generate_data(beta, sig, n)

MSE = []
X_test = np.ones((n, 1))

for p in p_range:
    if p > 1:
        X_test = np.hstack((X_test, u_test**(p-1)))

    y_hat = X_test @ betahat[p]  # predictions
    MSE.append(np.sum((y_test - y_hat)**2/n))   # calculate the mean square error

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
plt.plot(p_range, MSE, 'b', p_range, MSE, 'bo')
plt.xticks(ticks=p_range)
plt.xlabel('Number of parameters $p$')
plt.ylabel('Test loss')
plt.tight_layout()
plt.savefig('../figures/MSEpy.pdf',format='pdf')
plt.show()
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Figure: test loss as a function of the number of parameters \(p\)]
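
From the stored values we can also read off which \(p\) minimizes the estimated test loss, for instance:

# number of parameters with the smallest estimated test loss
best_p = p_range[int(np.argmin(MSE))]
print('p with minimal test loss:', best_p, 'loss: {:3.3f}'.format(min(MSE)))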

3.2.3. Cross-validation#

For the polynomial regression example, we can calculate a \(K\)-fold cross-validation loss with a nonrandom partitioning of the training set using the following code.

K_vals = [5, 10, 100]  # number of folds
cv = np.zeros((len(K_vals), max_p))
X = np.ones((n, 1))

for p in p_range:
    if p > 1:
        X = np.hstack((X, u**(p-1)))
    j = 0
    for K in K_vals:
        loss = []
        for k in range(1, K+1):
            # integer indices of test samples
            test_ind = ((n/K)*(k-1) + np.arange(1, n/K + 1) - 1).astype('int')
            train_ind = np.setdiff1d(np.arange(n), test_ind)

            X_train, y_train = X[train_ind, :], y[train_ind, :]
            X_test, y_test = X[test_ind, :], y[test_ind]

            # fit model and evaluate test loss
            betahat = solve(X_train.T @ X_train, X_train.T @ y_train)
            loss.append(norm(y_test - X_test @ betahat) ** 2)

        cv[j, p-1] = sum(loss) / n
        j += 1

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
p1 = plt.plot(p_range, cv[0, :], 'k-.', p_range, cv[0, :], 'k.', markersize=10)[0]
p2 = plt.plot(p_range, cv[1, :], 'r', p_range, cv[1, :], 'r.', markersize=10)[0]
p3 = plt.plot(p_range, cv[2, :], 'b--', p_range, cv[2, :], 'b.', markersize=10)[0]
plt.xticks(range(2, 19, 2))
plt.xlabel('Number of parameters $p$')
plt.ylabel('$K$-fold cross-validation loss')
plt.legend((p1,p2,p3),('$K$=5','$K$=10','$K$=100'))
plt.tight_layout()
plt.savefig('../figures/crossvalpy.pdf',format='pdf')
plt.show()
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Figure: \(K\)-fold cross-validation loss as a function of the number of parameters \(p\), for \(K = 5, 10, 100\)]
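
The same nonrandom, consecutive folds can also be produced with scikit-learn's KFold splitter (shuffle=False keeps the blocks in order). A minimal sketch for the cubic model \(p = 4\), using LinearRegression instead of solving the normal equations by hand:

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

p = 4
Xp = np.hstack([u**q for q in range(p)])   # design matrix with columns 1, u, u^2, u^3
loss = []
for train_ind, test_ind in KFold(n_splits=10, shuffle=False).split(Xp):
    reg = LinearRegression(fit_intercept=False).fit(Xp[train_ind], y[train_ind])
    loss.append(np.sum((y[test_ind] - reg.predict(Xp[test_ind]))**2))
print('10-fold CV loss for p = 4: {:3.3f}'.format(sum(loss) / n))
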
[1]

Dirk P. Kroese, Zdravko Botev, Thomas Taimre, and Radislav Vaisman. Data Science and Machine Learning: Mathematical and Statistical Methods. Machine Learning & Pattern Recognition. Chapman & Hall/CRC, 2020. URL: https://acems.org.au/data-science-machine-learning-book-available-download (visited on 2023-08-15).