Distances

This example involves various distances and similarities on nominal and continuous data.

    wine.quality = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
        header = TRUE, sep = ";")
    head(wine.quality)

    ##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
    ## 1           7.4             0.70        0.00            1.9     0.076
    ## 2           7.8             0.88        0.00            2.6     0.098
    ## 3           7.8             0.76        0.04            2.3     0.092
    ## 4          11.2             0.28        0.56            1.9     0.075
    ## 5           7.4             0.70        0.00            1.9     0.076
    ## 6           7.4             0.66        0.00            1.8     0.075
    ##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
    ## 1                  11                   34  0.9978 3.51      0.56     9.4
    ## 2                  25                   67  0.9968 3.20      0.68     9.8
    ## 3                  15                   54  0.9970 3.26      0.65     9.8
    ## 4                  17                   60  0.9980 3.16      0.58     9.8
    ## 5                  11                   34  0.9978 3.51      0.56     9.4
    ## 6                  13                   40  0.9978 3.51      0.56     9.4
    ##   quality
    ## 1       5
    ## 2       5
    ## 3       5
    ## 4       6
    ## 5       5
    ## 6       5

    str(wine.quality)

    ## 'data.frame':        1599 obs. of  12 variables:
    ##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
    ##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
    ##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
    ##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
    ##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
    ##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
    ##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
    ##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
    ##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
    ##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
    ##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
    ##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

There are 11 continuous features plus an ordinal variable quality. For now we will compute distances using just the first 10 columns.

Here is a function that computes the Minkowski distance between two vectors, for any p > 0 including p = \infty.

    minkowski = function(x, y, p = 2) {
        if ((p < Inf) & (p > 0)) {
            # the usual \ell_p formula: (sum_i |x_i - y_i|^p)^(1/p)
            s = sum(abs(x - y)^p)^(1/p)
        } else if (p == Inf) {
            # the limiting case: the largest coordinate-wise difference
            s = max(abs(x - y))
        } else {
            stop("p must be positive")
        }
        return(s)
    }
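
Applying this pairwise function over all pairs of rows yields a full distance matrix. Here is a minimal sketch; the choice of the first 5 wines and p = 3 is purely illustrative:

    # build an n x n Minkowski distance matrix by looping over all pairs of
    # rows (fine for small n; for large data, dist() below is much faster)
    n = 5
    D = matrix(0, n, n)
    for (i in 1:n) {
        for (j in 1:n) {
            D[i, j] = minkowski(wine.quality[i, 1:10], wine.quality[j, 1:10], p = 3)
        }
    }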

This function reproduces the usual Euclidean distance:

    minkowski(wine.quality[1, 1:10], wine.quality[2, 1:10], 2)

    ## [1] 35.86

    sqrt(sum((wine.quality[1, 1:10] - wine.quality[2, 1:10])^2))

    ## [1] 35.86

It can also compute the taxi-cab or \ell_1 distance:

    minkowski(wine.quality[1, 1:10], wine.quality[2, 1:10], 1)

    ## [1] 48.73

    sum(abs(wine.quality[1, 1:10] - wine.quality[2, 1:10]))

    ## [1] 48.73

As well as the max, or \ell_{\infty}, distance. Here the value 33 comes entirely from the total.sulfur.dioxide column (67 - 34 = 33), a reminder that these features live on very different scales:

    minkowski(wine.quality[1, 1:10], wine.quality[2, 1:10], Inf)

    ## [1] 33

    max(abs(wine.quality[1, 1:10] - wine.quality[2, 1:10]))

    ## [1] 33

R's built-in dist function knows these metrics and returns the entire distance matrix in one call:

    # Euclidean
    dist(wine.quality[1:5, 1:10], method = "euclidean")

    ##        1      2      3      4
    ## 2 35.858
    ## 3 20.406 16.405
    ## 4 26.964 11.213  7.227
    ## 5  0.000 35.858 20.406 26.964

    # Taxi-cab
    dist(wine.quality[1:5, 1:10], method = "manhattan")

    ##       1     2     3     4
    ## 2 48.73
    ## 3 25.26 23.56
    ## 4 37.15 20.42 12.99
    ## 5  0.00 48.73 25.26 37.15

    # Max
    dist(wine.quality[1:5, 1:10], method = "maximum")

    ##    1  2  3  4
    ## 2 33
    ## 3 20 13
    ## 4 26  8  6
    ## 5  0 33 20 26
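
The zeros in row 5 reflect that wines 1 and 5 are duplicate rows. Note also that dist returns a compact lower-triangle object; when a full symmetric matrix is more convenient, as.matrix expands it:

    # expand the lower-triangle dist object into a full 5 x 5 symmetric matrix
    as.matrix(dist(wine.quality[1:5, 1:10], method = "euclidean"))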

Binary sequences

We will compute some distances using the voting data. The matching information for two binary sequences x and y can be summarized in a 2 \times 2 table of counts:

    binary_table = function(x, y) {
        # counts of the four (x, y) combinations: result[i, j] holds the
        # number of positions where x == i - 1 and y == j - 1
        result = matrix(0, 2, 2)
        result[1, 1] = sum((x == 0) * (y == 0))
        result[1, 2] = sum((x == 0) * (y == 1))
        result[2, 1] = sum((x == 1) * (y == 0))
        result[2, 2] = sum((x == 1) * (y == 1))
        colnames(result) = c("y0", "y1")
        rownames(result) = c("x0", "x1")
        return(result)
    }
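
As a quick sanity check, here is a toy pair of length-4 vectors that agree in positions 1 and 3, so each cell of the table should hold exactly one count:

    # expected: one position in each of the (x0,y0), (x0,y1), (x1,y0), (x1,y1) cells
    binary_table(c(0, 1, 1, 0), c(0, 0, 1, 1))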

With this table, we can define the Simple Matching Coefficient (SMC): the fraction of positions where the two sequences agree.

    SMC = function(x, y) {
        bt = binary_table(x, y)
        return((bt[1, 1] + bt[2, 2])/sum(bt))
    }

Now, let’s compute this similarity on two rows of the voting data. We will choose two members who are likely to be far apart: Eric Cantor, a fairly partisan Republican, and Debbie Wasserman Schultz, a partisan Democrat.

    full_vote = read.table("http://stats202.stanford.edu/data/2011_votes.csv", header = TRUE,
        sep = ";")
    full_vote$name[411]

    ## [1] Wasserman Schultz
    ## 426 Levels: Ackerman Adams Aderholt Akin Alexander Altmire ... Young (IN)

    full_vote$name[64]

    ## [1] Cantor
    ## 426 Levels: Ackerman Adams Aderholt Akin Alexander Altmire ... Young (IN)

Let’s extract their rows of votes from the table, removing the first column because it is “party”.

    cleaned = read.table("http://stats202.stanford.edu/data/2011_cleaned_votes.csv",
        header = TRUE, sep = ";")
    cantor = cleaned[64, 2:ncol(cleaned)]
    wschultz = cleaned[411, 2:ncol(cleaned)]

To compare only the votes where both members expressed a true preference, we keep the positions where neither row is zero.

    keep = (cantor != 0) & (wschultz != 0)
    sum(keep)

    ## [1] 819

    cantor_k = cantor[keep]
    wschultz_k = wschultz[keep]

The values are \pm 1, so we map them to \{0, 1\} via (x + 1)/2:

    cantor_k = (cantor_k + 1)/2
    wschultz_k = (wschultz_k + 1)/2

    binary_table(cantor_k, wschultz_k)

    ##     y0  y1
    ## x0  92 277
    ## x1 320 130

    SMC(cantor_k, wschultz_k)

    ## [1] 0.2711

This shows that the two agreed on roughly 27% of the votes. Recall that the Hamming distance and the SMC are related by the formula

\text{SMC}(x,y) = 1 - \frac{\text{H}(x,y)}{\text{len}(x)}

    hamming = function(x, y) {
        return(sum(x != y))
    }

    SMC(cantor_k, wschultz_k)

    ## [1] 0.2711

    1 - hamming(cantor_k, wschultz_k)/sum(keep)

    ## [1] 0.2711

The Jaccard and cosine similarities are also easy to compute, as is the Extended Tanimoto Coefficient from the book

    jaccard = function(x, y) {
        bt = binary_table(x, y)
        return((bt[2, 2])/(bt[1, 2] + bt[2, 1] + bt[2, 2]))
    }
    jaccard(cantor_k, wschultz_k)

    ## [1] 0.1788

    cosine_sim = function(x, y) {
        return(sum(x * y)/sqrt(sum(x^2) * sum(y^2)))
    }
    cosine_sim(cantor_k, wschultz_k)

    ## [1] 0.3038

    tanimoto = function(x, y) {
        sxy = sum(x * y)
        return(sxy/(sum(x^2) + sum(y^2) - sxy))
    }
    tanimoto(cantor_k, wschultz_k)

    ## [1] 0.1788
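
The match between the Jaccard and Tanimoto values is not a coincidence. On 0/1 vectors, \sum_i x_i y_i equals the x1/y1 count f_{11} of the binary table, while \sum_i x_i^2 = f_{11} + f_{10} and \sum_i y_i^2 = f_{11} + f_{01}, so

\text{Tanimoto}(x, y) = \frac{\sum_i x_i y_i}{\sum_i x_i^2 + \sum_i y_i^2 - \sum_i x_i y_i} = \frac{f_{11}}{f_{11} + f_{10} + f_{01}} = \text{Jaccard}(x, y).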

The Jaccard distance, i.e. 1 minus the Jaccard similarity, can be computed in R using the dist function with method = "binary".

    dist(rbind(cantor_k, wschultz_k), method = "binary")

    ##            cantor_k
    ## wschultz_k   0.8212

    1 - jaccard(cantor_k, wschultz_k)

    ## [1] 0.8212

Perhaps for politicians the interesting (asymmetric) event is voting against a bill. What is their Jaccard similarity when we look at 1 - cantor_k and 1 - wschultz_k?

    jaccard(1 - cantor_k, 1 - wschultz_k)

    ## [1] 0.1335

Health data

The health data has many different types of features. Here we build a dissimilarity matrix by combining per-feature dissimilarities across the different feature types, possibly with a given set of weights.

    health = read.table("http://stats202.stanford.edu/data/health.csv", header = TRUE,
        sep = ",")
    str(health)

    ## 'data.frame':        24662 obs. of  14 variables:
    ##  $ country      : Factor w/ 3 levels "aeh","pih","ugh": 1 1 1 1 1 1 1 1 1 1 ...
    ##  $ age          : Factor w/ 6 levels "11-","12","13",..: 6 6 6 6 5 6 6 6 6 6 ...
    ##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
    ##  $ height       : num  1.7 1.72 NA 1.65 1.72 1.71 1.63 1.59 1.67 1.74 ...
    ##  $ weight       : int  58 85 NA 51 52 88 70 56 125 54 ...
    ##  $ hungry       : Factor w/ 5 levels "Always","Most of the time",..: 5 3 3 3 3 4 1 4 3 3 ...
    ##  $ fruit        : Factor w/ 7 levels "<1","0","1","2",..: 1 1 1 1 1 1 4 1 4 4 ...
    ##  $ vegetables   : Factor w/ 7 levels "<1","0","1","2",..: 3 1 3 4 1 1 1 3 3 4 ...
    ##  $ teeth        : Factor w/ 6 levels "<1","0","1","2",..: 2 4 5 3 1 1 2 3 2 2 ...
    ##  $ hands_eating : Factor w/ 5 levels "Always","Most of the time",..: 1 1 1 1 1 1 1 1 1 1 ...
    ##  $ hands_toilet : Factor w/ 5 levels "Always","Most of the time",..: 1 2 1 1 1 1 1 1 1 1 ...
    ##  $ hands_soap   : Factor w/ 5 levels "Always","Most of the time",..: 5 1 1 1 2 1 1 1 1 1 ...
    ##  $ sample_weight: num  30.3 30.3 30.3 30.3 30.3 ...
    ##  $ bmi          : num  20.1 28.7 NA 18.7 17.6 ...

The function daisy in the cluster package will automatically combine numeric and nominal or ordinal variables.

    library(cluster)
    daisy(health[1:3, ])

    ## Dissimilarities :
    ##        1      2
    ## 2 0.5714
    ## 3 0.2727 0.2727
    ##
    ## Metric :  mixed ;  Types = N, N, N, I, I, N, N, N, N, N, N, N, I, I
    ## Number of objects : 3

The Types line above shows which variable types daisy used: N for nominal, I for interval. Since some of the features (such as age and hungry) are actually ordered, we should force R to recognize this:

    health$hungry = factor(health$hungry, levels = c("Never", "Rarely", "Sometimes",
        "Most of the time", "Always"), ordered = TRUE)
    health$age = factor(health$age, levels = c("11-", "12", "13", "14", "15", "16+"),
        ordered = TRUE)
    str(health)

    ## 'data.frame':        24662 obs. of  14 variables:
    ##  $ country      : Factor w/ 3 levels "aeh","pih","ugh": 1 1 1 1 1 1 1 1 1 1 ...
    ##  $ age          : Ord.factor w/ 6 levels "11-"<"12"<"13"<..: 6 6 6 6 5 6 6 6 6 6 ...
    ##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
    ##  $ height       : num  1.7 1.72 NA 1.65 1.72 1.71 1.63 1.59 1.67 1.74 ...
    ##  $ weight       : int  58 85 NA 51 52 88 70 56 125 54 ...
    ##  $ hungry       : Ord.factor w/ 5 levels "Never"<"Rarely"<..: 3 1 1 1 1 2 5 2 1 1 ...
    ##  $ fruit        : Factor w/ 7 levels "<1","0","1","2",..: 1 1 1 1 1 1 4 1 4 4 ...
    ##  $ vegetables   : Factor w/ 7 levels "<1","0","1","2",..: 3 1 3 4 1 1 1 3 3 4 ...
    ##  $ teeth        : Factor w/ 6 levels "<1","0","1","2",..: 2 4 5 3 1 1 2 3 2 2 ...
    ##  $ hands_eating : Factor w/ 5 levels "Always","Most of the time",..: 1 1 1 1 1 1 1 1 1 1 ...
    ##  $ hands_toilet : Factor w/ 5 levels "Always","Most of the time",..: 1 2 1 1 1 1 1 1 1 1 ...
    ##  $ hands_soap   : Factor w/ 5 levels "Always","Most of the time",..: 5 1 1 1 2 1 1 1 1 1 ...
    ##  $ sample_weight: num  30.3 30.3 30.3 30.3 30.3 ...
    ##  $ bmi          : num  20.1 28.7 NA 18.7 17.6 ...

    daisy(health[1:3, ])

    ## Dissimilarities :
    ##        1      2
    ## 2 0.5714
    ## 3 0.2727 0.2727
    ##
    ## Metric :  mixed ;  Types = N, O, N, I, I, O, N, N, N, N, N, N, I, I
    ## Number of objects : 3
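
The hand-washing variables (hands_eating, hands_toilet, hands_soap) are still typed N above, yet they appear to share the same frequency scale as hungry. The same recipe would mark them as ordered; a sketch, where the full level set is inferred rather than fully shown by str:

    # assumed level ordering: str() only displays "Always" and "Most of the
    # time"; the remaining three level names are inferred from hungry's scale
    freq_levels = c("Never", "Rarely", "Sometimes", "Most of the time", "Always")
    health$hands_eating = factor(health$hands_eating, levels = freq_levels, ordered = TRUE)
    health$hands_toilet = factor(health$hands_toilet, levels = freq_levels, ordered = TRUE)
    health$hands_soap = factor(health$hands_soap, levels = freq_levels, ordered = TRUE)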

    names(health)

    ##  [1] "country"       "age"           "sex"           "height"
    ##  [5] "weight"        "hungry"        "fruit"         "vegetables"
    ##  [9] "teeth"         "hands_eating"  "hands_toilet"  "hands_soap"
    ## [13] "sample_weight" "bmi"
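
The combination "with a given set of weights" mentioned at the start of this section can be done through daisy's weights argument, which takes one weight per variable. A minimal sketch, assuming we want to exclude the bookkeeping column sample_weight and double the influence of bmi (these particular weights are illustrative, not a recommendation):

    # one weight per column of health (see names(health) above for the order)
    w = rep(1, ncol(health))
    w[13] = 0    # sample_weight: survey bookkeeping, drop from the dissimilarity
    w[14] = 2    # bmi: count it twice as heavily
    daisy(health[1:3, ], weights = w)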

Correlation

Here are some sample plots of high and low correlation. For the uncorrelated case, X and Y are generated completely independently of each other; even so, the sample correlation will not be exactly zero, as seen below.

    X = rnorm(40)
    Y = rnorm(40)
    plot(X, Y, pch = 23, bg = "red", cex = 2)

[figure distances_fig_00.png: scatterplot of the independently generated X and Y]

    cor(X, Y)

    ## [1] 0.2389

    cosine_sim(X - mean(X), Y - mean(Y))

    ## [1] 0.2389
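
The agreement between cor and cosine_sim here is exact, not a coincidence of this sample: the sample correlation is by definition the cosine of the angle between the mean-centered vectors,

\text{cor}(X, Y) = \frac{\sum_i (X_i - \bar{X})(Y_i - \bar{Y})}{\sqrt{\sum_i (X_i - \bar{X})^2 \sum_i (Y_i - \bar{Y})^2}},

which is exactly cosine_sim(X - mean(X), Y - mean(Y)).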

The following is high positive correlation.

    X = rnorm(40)
    Y = rnorm(40) + 3 * X
    plot(X, Y, pch = 23, bg = "red", cex = 2)

[figure distances_fig_01.png: scatterplot showing strong positive correlation]

    cor(X, Y)

    ## [1] 0.9487

    cosine_sim(X - mean(X), Y - mean(Y))

    ## [1] 0.9487

To get negative correlation we just need to change 3 to -3.

    X = rnorm(40)
    Y = rnorm(40) - 3 * X
    plot(X, Y, pch = 23, bg = "red", cex = 2)

[figure distances_fig_02.png: scatterplot showing strong negative correlation]

    cor(X, Y)

    ## [1] -0.9647

    cosine_sim(X - mean(X), Y - mean(Y))

    ## [1] -0.9647

Here is a sample with moderate correlation:

    X = rnorm(40)
    Y = rnorm(40) + X
    plot(X, Y, pch = 23, bg = "red", cex = 2)

[figure distances_fig_03.png: scatterplot showing moderate positive correlation]

    cor(X, Y)

    ## [1] 0.778

    cosine_sim(X - mean(X), Y - mean(Y))

    ## [1] 0.778