Discriminant analysis

This example applies LDA and QDA to the iris data. While it is simple to fit LDA and QDA in R, the plots showing the decision boundaries were made with Python rather than R, using the snippet of code we saw in the tree example.

    iris = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
        sep = ",", header = FALSE)
    names(iris) = c("sepal.length", "sepal.width", "petal.length", "petal.width",
        "iris.type")
    attach(iris)
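
As a quick sanity check (a small addition, not part of the original notes), we can confirm the data loaded with the expected shape and class counts:

    dim(iris)         # expect 150 observations and 5 columns
    table(iris.type)  # expect 50 observations of each species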

Here is an example of LDA using only petal.width and petal.length.

    library(MASS)
    iris.lda = lda(iris.type ~ petal.length + petal.width, data = iris)
    iris.lda

    ## Call:
    ## lda(iris.type ~ petal.length + petal.width, data = iris)
    ##
    ## Prior probabilities of groups:
    ##     Iris-setosa Iris-versicolor  Iris-virginica
    ##          0.3333          0.3333          0.3333
    ##
    ## Group means:
    ##                 petal.length petal.width
    ## Iris-setosa            1.464       0.244
    ## Iris-versicolor        4.260       1.326
    ## Iris-virginica         5.552       2.026
    ##
    ## Coefficients of linear discriminants:
    ##                LD1    LD2
    ## petal.length 1.544 -2.157
    ## petal.width  2.403  5.024
    ##
    ## Proportion of trace:
    ##    LD1    LD2
    ## 0.9948 0.0052

    plot(iris.lda)

_images/lda_fig_00.png
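
To see how well this two-variable rule fits the training data, one option (a sketch, not part of the original analysis) is a confusion matrix of predicted versus true classes:

    # predictions on the training data and the resulting confusion matrix
    table(predict(iris.lda)$class, iris.type)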

In[5]:

%load_ext rmagic
%R -d iris
from matplotlib import pyplot as plt, mlab, pylab
import numpy as  np
col = {1:'r', 2:'y', 3:'g'}

In[6]:

grid = np.mgrid[-0.5:3.5:500j,0:8:400j]
gridT = grid.reshape((2,-1)).T
gridT.shape

Out[6]:

(200000, 2)

In[7]:

%%R -i gridT -o labels -d iris
colnames(gridT) = c('petal.width', 'petal.length')
gridT = data.frame(gridT)
labels = predict(iris.lda, gridT)$class

In[8]:

plt.imshow(labels.reshape((500,400)), interpolation='nearest', origin='lower', alpha=0.4, extent=[0,8,-0.5,3], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['petal.length'], iris['petal.width'], c=[col[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((0,8))
a.set_ylim((-0.5,3))

Out[8]:

(-0.5, 3)
_images/lda_fig_01.png

If we want a nice visualization of the boundary, we can use any pair of variables. Here is one using sepal.length and sepal.width.

    iris.lda = lda(iris.type ~ sepal.length + sepal.width, data = iris)
    iris.lda

    ## Call:
    ## lda(iris.type ~ sepal.length + sepal.width, data = iris)
    ##
    ## Prior probabilities of groups:
    ##     Iris-setosa Iris-versicolor  Iris-virginica
    ##          0.3333          0.3333          0.3333
    ##
    ## Group means:
    ##                 sepal.length sepal.width
    ## Iris-setosa            5.006       3.418
    ## Iris-versicolor        5.936       2.770
    ## Iris-virginica         6.588       2.974
    ##
    ## Coefficients of linear discriminants:
    ##                 LD1     LD2
    ## sepal.length -2.148 -0.8024
    ## sepal.width   2.753 -2.1072
    ##
    ## Proportion of trace:
    ##    LD1    LD2
    ## 0.9628 0.0372

    plot(iris.lda)

_images/lda_fig_02.png

With the corresponding decision boundaries:

In[10]:

grid = np.mgrid[3.5:8:500j,2:5:400j]
gridT = grid.reshape((2,-1)).T

In[11]:

%%R -i gridT -o labels -d iris
colnames(gridT) = c('sepal.length', 'sepal.width')
gridT = data.frame(gridT)
labels = predict(iris.lda, gridT)$class

In[12]:

plt.imshow(labels.reshape((500,400)).T, interpolation='nearest', origin='lower', alpha=0.4, extent=[3.5,8,2,5], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['sepal.length'], iris['sepal.width'], c=[col[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((3.5,8))
a.set_ylim((2,5))

Out[12]:

(2, 5)
_images/lda_fig_03.png

Finally, here is the result when viewing the first two discriminants, using all the data.

    iris.lda = lda(iris.type ~ ., data = iris)
    iris.lda

    ## Call:
    ## lda(iris.type ~ ., data = iris)
    ##
    ## Prior probabilities of groups:
    ##     Iris-setosa Iris-versicolor  Iris-virginica
    ##          0.3333          0.3333          0.3333
    ##
    ## Group means:
    ##                 sepal.length sepal.width petal.length petal.width
    ## Iris-setosa            5.006       3.418        1.464       0.244
    ## Iris-versicolor        5.936       2.770        4.260       1.326
    ## Iris-virginica         6.588       2.974        5.552       2.026
    ##
    ## Coefficients of linear discriminants:
    ##                  LD1      LD2
    ## sepal.length  0.8193  0.03286
    ## sepal.width   1.5479  2.15471
    ## petal.length -2.1849 -0.93025
    ## petal.width  -2.8539  2.80600
    ##
    ## Proportion of trace:
    ##    LD1    LD2
    ## 0.9915 0.0085

    plot(iris.lda)

_images/lda_fig_04.png
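
The plot above shows the observations in the coordinates of the two linear discriminants. If you want those coordinates directly (for example, to redraw the plot in Python), a sketch along these lines should work:

    # 150 x 2 matrix of discriminant scores (LD1, LD2) for the training data
    lda.proj = predict(iris.lda)$x
    head(lda.proj)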

Quadratic discriminant analysis

The decision boundaries for QDA are (parts of) level sets of quadratic functions.
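
To see why (a short derivation, not in the original notes): the boundary between two classes k and l is where their posterior probabilities are equal, and with class-specific covariance matrices the log posterior ratio is quadratic in x:

    \log\frac{P(y = k \mid x)}{P(y = l \mid x)}
      = -\tfrac{1}{2}\, x^\top \left(\Sigma_k^{-1} - \Sigma_l^{-1}\right) x
        + \left(\Sigma_k^{-1}\mu_k - \Sigma_l^{-1}\mu_l\right)^\top x
        + \mathrm{const}

Setting this to zero gives a quadratic equation in x; in LDA the common covariance makes the quadratic term cancel, which is why those boundaries are linear.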

    iris.qda = qda(iris.type ~ petal.length + petal.width, data = iris)
    iris.qda

    ## Call:
    ## qda(iris.type ~ petal.length + petal.width, data = iris)
    ##
    ## Prior probabilities of groups:
    ##     Iris-setosa Iris-versicolor  Iris-virginica
    ##          0.3333          0.3333          0.3333
    ##
    ## Group means:
    ##                 petal.length petal.width
    ## Iris-setosa            1.464       0.244
    ## Iris-versicolor        4.260       1.326
    ## Iris-virginica         5.552       2.026

In[15]:

grid = np.mgrid[-0.5:3.5:500j,0:8:400j]
gridT = grid.reshape((2,-1)).T
gridT.shape

Out[15]:

(200000, 2)

In[16]:

%%R -i gridT -o labels -d iris
colnames(gridT) = c('petal.width', 'petal.length')
gridT = data.frame(gridT)
labels = predict(iris.qda, gridT)$class

In[17]:

plt.imshow(labels.reshape((500,400)), interpolation='nearest', origin='lower', alpha=0.4, extent=[0,8,-0.5,3], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['petal.length'], iris['petal.width'], c=[col[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((0,8))
a.set_ylim((-0.5,3))

Out[17]:

(-0.5, 3)
_images/lda_fig_05.png

Here is the equivalent plot for sepal.length and sepal.width.

In[18]:

grid = np.mgrid[3.5:8:500j,2:5:400j]
gridT = grid.reshape((2,-1)).T

In[19]:

%%R -i gridT -o labels -d iris
colnames(gridT) = c('sepal.length', 'sepal.width')
iris.qda = qda(iris.type ~ sepal.length + sepal.width, data=iris)
gridT = data.frame(gridT)
labels = predict(iris.qda, gridT)$class

In[20]:

plt.imshow(labels.reshape((500,400)).T, interpolation='nearest', origin='lower', alpha=0.4, extent=[3.5,8,2,5], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['sepal.length'], iris['sepal.width'], c=[col[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((3.5,8))
a.set_ylim((2,5))

Out[20]:

(2, 5)
_images/lda_fig_06.png

Finally, here is the result when we use all of the variables.

    iris.qda = qda(iris.type ~ ., data = iris)
    iris.qda

    ## Call:
    ## qda(iris.type ~ ., data = iris)
    ##
    ## Prior probabilities of groups:
    ##     Iris-setosa Iris-versicolor  Iris-virginica
    ##          0.3333          0.3333          0.3333
    ##
    ## Group means:
    ##                 sepal.length sepal.width petal.length petal.width
    ## Iris-setosa            5.006       3.418        1.464       0.244
    ## Iris-versicolor        5.936       2.770        4.260       1.326
    ## Iris-virginica         6.588       2.974        5.552       2.026

    names(iris.qda)

    ##  [1] "prior"   "counts"  "means"   "scaling" "ldet"    "lev"     "N"
    ##  [8] "call"    "terms"   "xlevels"
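
A natural follow-up (a sketch, not in the original notes) is leave-one-out cross-validation, which lda and qda support through the CV argument:

    # leave-one-out cross-validated class predictions and confusion matrix
    iris.qda.cv = qda(iris.type ~ ., data = iris, CV = TRUE)
    table(iris.qda.cv$class, iris.type)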

Logistic regression

The LDA and QDA models are simple discriminant models. Another popular discriminant model is logistic regression. In its basic form, it can only be used for binary classification problems, so let's try to distinguish Setosa from Virginica and Versicolor.
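
For reference (not spelled out in the original notes), the model fit below estimates the probability that an observation is Setosa as a logistic function of a linear predictor in the two sepal measurements:

    P(Y = 1 \mid x) = \frac{1}{1 + \exp\!\left(-(\beta_0 + \beta_1\,\mathrm{sepal.length} + \beta_2\,\mathrm{sepal.width})\right)}

The glm call below fits exactly this model.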

    Y = iris.type == "Iris-setosa"
    logistic.model = glm(Y ~ sepal.length + sepal.width, family = binomial())

    ## Warning: glm.fit: algorithm did not converge

    ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

    labels = predict(logistic.model, gridT, "response")
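
Those warnings (a note added here, not in the original) usually indicate that the classes are completely, or almost completely, separable in the chosen variables, so the maximum-likelihood coefficients are pushed toward infinity; the fitted probabilities are still usable for plotting. One quick check is the size of the estimated coefficients:

    coef(logistic.model)  # very large coefficients are a symptom of separation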

In[37]:

%R -o labels
col_new = {1:'r', 2:'y', 3:'y'}

plt.imshow(labels.reshape((500,400)).T, interpolation='nearest', origin='lower', alpha=0.4, extent=[3.5,8,2,5], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['sepal.length'], iris['sepal.width'], c=[col_new[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((3.5,8))
a.set_ylim((2,5))

Out[37]:

(2, 5)
_images/lda_fig_07.png

It does quite well at separating Setosa from Virginica and Versicolor, but not quite as well at separating Versicolor from the other two.

    Z = iris.type == "Iris-versicolor"
    logistic.model = glm(Z ~ sepal.length + sepal.width, family = binomial())
    labels = predict(logistic.model, gridT, "response")

In[46]:

%R -o labels
col_new = {1:'r', 2:'g', 3:'r'}
plt.imshow(labels.reshape((500,400)).T, interpolation='nearest', origin='lower', alpha=0.4, extent=[3.5,8,2,5], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['sepal.length'], iris['sepal.width'], c=[col_new[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((3.5,8))
a.set_ylim((2,5))

Out[46]:

(2, 5)
_images/lda_fig_08.png

In this model we actually have an estimated probability, which we will use for ROC curves below. To come up with a prediction, we might choose to threshold this probability at, say, 0.5.
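
Since the logistic function is monotone, thresholding the probability at 0.5 is the same as thresholding the linear predictor at 0 (a small observation, not in the original notes), so the resulting decision boundary is a straight line in the (sepal.length, sepal.width) plane:

    P(Y = 1 \mid x) > \tfrac{1}{2}
      \iff \beta_0 + \beta_1\,\mathrm{sepal.length} + \beta_2\,\mathrm{sepal.width} > 0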

In[47]:

plt.imshow(labels.reshape((500,400)).T > 0.5, interpolation='nearest', origin='lower', alpha=0.4, extent=[3.5,8,2,5], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['sepal.length'], iris['sepal.width'], c=[col_new[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((3.5,8))
a.set_ylim((2,5))

Out[47]:

(2, 5)
_images/lda_fig_09.png

How about separating Virginica from the other two?

    W = iris.type == "Iris-virginica"
    logistic.model = glm(W ~ sepal.length + sepal.width, family = binomial())
    labels = predict(logistic.model, gridT, "response")

In[52]:

%R -o labels
col_new = {1:'r', 2:'r', 3:'g'}
plt.imshow(labels.reshape((500,400)).T, interpolation='nearest', origin='lower', alpha=0.4, extent=[3.5,8,2,5], cmap=pylab.cm.RdYlGn)
plt.scatter(iris['sepal.length'], iris['sepal.width'], c=[col_new[t] for t in iris['iris.type']])
a = plt.gca()
a.set_xlim((3.5,8))
a.set_ylim((2,5))

Out[52]:

(2, 5)
_images/lda_fig_10.png

Of course, we can use all of the variables as well.

    logistic.model = glm(W ~ sepal.length + sepal.width + petal.length + petal.width,
        family = binomial())

    ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

    predictions = predict(logistic.model, type = "response")
    training_error = sum((predictions > 0.5) != W)
    training_error

    ## [1] 2
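
The value above is a count of misclassified training points; dividing by the number of observations gives the training misclassification rate (a small addition to the original code):

    training_error/nrow(iris)  # 2/150, about 1.3%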

ROC curves

An ROC curve plots the true positive rate against the false positive rate as the classification threshold varies, which lets us compare the classifiers above on the Versicolor problem. We compute the curves with the ROCR package.

    library(ROCR)
    logistic.model = glm(Z ~ sepal.length + sepal.width + petal.length + petal.width,
        family = binomial())
    logistic.scores = predict(logistic.model, type = "response")
    logistic.rocr = prediction(logistic.scores, Z)
    plot(performance(logistic.rocr, "tpr", "fpr"), col = "red")

_images/lda_fig_11.png

    library(rpart)
    iris.rpart = rpart(Z ~ petal.length + petal.width + sepal.length + sepal.width,
        cp = 0.001, method = "class")

    # the second column contains the predicted probabilities for versicolor
    rpart.scores = predict(iris.rpart, type = "prob")[, 2]
    rpart.rocr = prediction(rpart.scores, Z)
    plot(performance(logistic.rocr, "tpr", "fpr"), col = "red")
    plot(performance(rpart.rocr, "tpr", "fpr"), col = "blue", add = TRUE)
    iris.lda = lda(iris.type ~ petal.length + petal.width + sepal.length + sepal.width)
    lda.scores = predict(iris.lda)$posterior[, 2]
    lda.rocr = prediction(lda.scores, Z)
    plot(performance(lda.rocr, "tpr", "fpr"), col = "green", add = TRUE)

_images/lda_fig_12.png
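
To summarize each curve with a single number (a sketch, not in the original notes), ROCR can also compute the area under the ROC curve:

    # area under the curve for the logistic, tree and LDA classifiers
    unlist(performance(logistic.rocr, "auc")@y.values)
    unlist(performance(rpart.rocr, "auc")@y.values)
    unlist(performance(lda.rocr, "auc")@y.values)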