py4sci

Table Of Contents

Previous topic

Some help for R

Next topic

Features for time series

This Page

Discretization of a continuous score

Simple discretization

This example demonstrates various ways to discretize a continuous variable.

    # Download the sample data set and pull out column V1 as the variable to
    # discretize, then preview its first few values.
    data <- read.table("http://stats202.stanford.edu/data/sample_kmean1.csv")
    X <- data[["V1"]]
    head(X)

    ## [1] -3.1034 -0.9578 -2.9519 -3.4504 -1.1118 -3.3890

The data are bimodal: a plot of the density suggests that a cut at 0 would give a good partition.

    # Plot a kernel density estimate of X to inspect the shape of its distribution.
    plot(density(X))

_images/discretization_fig_00.png

To discretize on a predetermined set of values, we use the function cut.

    # Bin X into consecutive intervals of width 2 between -6 and 6, then print
    # the resulting factor.
    intervals_of_length_2 <- cut(X, breaks = seq(from = -6, to = 6, by = 2))
    intervals_of_length_2

    ##   [1] (-4,-2] (-2,0]  (-4,-2] (-4,-2] (-2,0]  (-4,-2] (-2,0]  (-6,-4]
    ##   [9] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2]
    ##  [17] (-4,-2] (-4,-2] (-4,-2] (-6,-4] (-6,-4] (-2,0]  (-6,-4] (-4,-2]
    ##  [25] (-6,-4] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2]
    ##  [33] (-4,-2] (-4,-2] (-6,-4] (-6,-4] (-4,-2] (-4,-2] (-4,-2] (-4,-2]
    ##  [41] (-6,-4] (-2,0]  (-4,-2] (-2,0]  (-2,0]  (-4,-2] (-4,-2] (-4,-2]
    ##  [49] (-2,0]  (-4,-2] (-2,0]  (-4,-2] (-6,-4] (-6,-4] (-4,-2] (-4,-2]
    ##  [57] (-2,0]  (-4,-2] (-4,-2] (-6,-4] (-4,-2] (-6,-4] (-6,-4] (-4,-2]
    ##  [65] (-4,-2] (-4,-2] (-4,-2] (-2,0]  (-4,-2] (-4,-2] (-4,-2] (-4,-2]
    ##  [73] (-4,-2] (-6,-4] (-6,-4] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2]
    ##  [81] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-6,-4]
    ##  [89] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2] (-4,-2]
    ##  [97] (-4,-2] (-4,-2] (-6,-4] (-6,-4] (2,4]   (2,4]   (2,4]   (2,4]
    ## [105] (2,4]   (2,4]   (2,4]   (2,4]   (0,2]   (2,4]   (2,4]   (2,4]
    ## [113] (2,4]   (2,4]   (2,4]   (2,4]   (4,6]   (2,4]   (2,4]   (2,4]
    ## [121] (2,4]   (2,4]   (2,4]   (0,2]   (0,2]   (4,6]   (2,4]   (2,4]
    ## [129] (4,6]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (0,2]   (2,4]
    ## [137] (2,4]   (2,4]   (0,2]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]
    ## [145] (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (4,6]   (2,4]
    ## [153] (4,6]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]
    ## [161] (0,2]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]
    ## [169] (2,4]   (4,6]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]   (2,4]
    ## [177] (2,4]   (2,4]   (0,2]   (2,4]   (2,4]   (4,6]   (2,4]   (2,4]
    ## [185] (0,2]   (2,4]   (4,6]   (0,2]   (2,4]   (2,4]   (2,4]   (2,4]
    ## [193] (2,4]   (2,4]   (2,4]   (4,6]   (0,2]   (2,4]   (2,4]   (4,6]
    ## Levels: (-6,-4] (-4,-2] (-2,0] (0,2] (2,4] (4,6]

    # Count the observations that fall in each interval.
    summary(intervals_of_length_2)

    ## (-6,-4] (-4,-2]  (-2,0]   (0,2]   (2,4]   (4,6]
    ##      18      71      11      10      80      10

We can cut with quartiles as follows:

    # Bin X by its quartiles. cut() builds left-open intervals by default, so
    # without include.lowest = TRUE the minimum of X (the 0% quantile) would
    # fall outside the first interval and be returned as NA.
    by_quartile <- cut(
      X,
      breaks = quantile(X, probs = c(0, 0.25, 0.5, 0.75, 1)),
      include.lowest = TRUE
    )
    by_quartile

    ##   [1] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06] (-3.06,0.215]
    ##   [6] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06]
    ##  [11] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06] [-5.63,-3.06]
    ##  [16] (-3.06,0.215] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06]
    ##  [21] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06]
    ##  [26] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215]
    ##  [31] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06]
    ##  [36] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215]
    ##  [41] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215]
    ##  [46] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06]
    ##  [51] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06] [-5.63,-3.06] (-3.06,0.215]
    ##  [56] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06] [-5.63,-3.06]
    ##  [61] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06]
    ##  [66] (-3.06,0.215] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215]
    ##  [71] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06]
    ##  [76] (-3.06,0.215] [-5.63,-3.06] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06]
    ##  [81] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06]
    ##  [86] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215]
    ##  [91] [-5.63,-3.06] [-5.63,-3.06] (-3.06,0.215] (-3.06,0.215] [-5.63,-3.06]
    ##  [96] [-5.63,-3.06] (-3.06,0.215] [-5.63,-3.06] [-5.63,-3.06] [-5.63,-3.06]
    ## [101] (3.07,5.26]   (3.07,5.26]   (3.07,5.26]   (0.215,3.07]  (0.215,3.07]
    ## [106] (0.215,3.07]  (0.215,3.07]  (3.07,5.26]   (0.215,3.07]  (0.215,3.07]
    ## [111] (0.215,3.07]  (0.215,3.07]  (0.215,3.07]  (0.215,3.07]  (3.07,5.26]
    ## [116] (0.215,3.07]  (3.07,5.26]   (3.07,5.26]   (3.07,5.26]   (3.07,5.26]
    ## [121] (0.215,3.07]  (0.215,3.07]  (3.07,5.26]   (0.215,3.07]  (0.215,3.07]
    ## [126] (3.07,5.26]   (0.215,3.07]  (3.07,5.26]   (3.07,5.26]   (0.215,3.07]
    ## [131] (3.07,5.26]   (0.215,3.07]  (0.215,3.07]  (3.07,5.26]   (0.215,3.07]
    ## [136] (3.07,5.26]   (3.07,5.26]   (0.215,3.07]  (0.215,3.07]  (3.07,5.26]
    ## [141] (3.07,5.26]   (0.215,3.07]  (3.07,5.26]   (3.07,5.26]   (3.07,5.26]
    ## [146] (3.07,5.26]   (3.07,5.26]   (0.215,3.07]  (0.215,3.07]  (0.215,3.07]
    ## [151] (3.07,5.26]   (0.215,3.07]  (3.07,5.26]   (0.215,3.07]  (3.07,5.26]
    ## [156] (0.215,3.07]  (3.07,5.26]   (0.215,3.07]  (3.07,5.26]   (3.07,5.26]
    ## [161] (0.215,3.07]  (3.07,5.26]   (3.07,5.26]   (3.07,5.26]   (3.07,5.26]
    ## [166] (0.215,3.07]  (3.07,5.26]   (3.07,5.26]   (3.07,5.26]   (3.07,5.26]
    ## [171] (3.07,5.26]   (3.07,5.26]   (0.215,3.07]  (0.215,3.07]  (0.215,3.07]
    ## [176] (3.07,5.26]   (0.215,3.07]  (0.215,3.07]  (0.215,3.07]  (0.215,3.07]
    ## [181] (0.215,3.07]  (3.07,5.26]   (3.07,5.26]   (3.07,5.26]   (0.215,3.07]
    ## [186] (0.215,3.07]  (3.07,5.26]   (0.215,3.07]  (3.07,5.26]   (0.215,3.07]
    ## [191] (0.215,3.07]  (0.215,3.07]  (3.07,5.26]   (0.215,3.07]  (0.215,3.07]
    ## [196] (3.07,5.26]   (0.215,3.07]  (3.07,5.26]   (0.215,3.07]  (3.07,5.26]
    ## Levels: [-5.63,-3.06] (-3.06,0.215] (0.215,3.07] (3.07,5.26]

    # Each quartile bin now holds a quarter of the 200 observations.
    summary(by_quartile)

    ## [-5.63,-3.06] (-3.06,0.215]  (0.215,3.07]   (3.07,5.26]
    ##            50            50            50            50

Using clustering: K-means

Let’s use a clustering algorithm to partition the variable into two groups.

    # Split X into two clusters with k-means and print the fitted object.
    X_by_kmeans <- kmeans(X, centers = 2)
    X_by_kmeans

    ## K-means clustering with 2 clusters of sizes 100, 100
    ##
    ## Cluster means:
    ##     [,1]
    ## 1  3.040
    ## 2 -3.062
    ##
    ## Clustering vector:
    ##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
    ##  [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
    ##  [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
    ## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    ## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    ## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
    ##
    ## Within cluster sum of squares by cluster:
    ## [1] 68.70 89.95
    ##  (between_SS / total_SS =  92.1 %)
    ##
    ## Available components:
    ##
    ## [1] "cluster"      "centers"      "totss"        "withinss"
    ## [5] "tot.withinss" "betweenss"    "size"