Visual summaries

#### Next topic

Multidimensional arrays

# Multidimensional scaling¶

The following Olympic example shows the relation between MDS and PCA when the similarity matrix is a Euclidean one.

## Olympic data¶

    library(ade4)


    ## Attaching package: 'ade4'


    ## The following object(s) are masked from 'package:base':
##
## within


    data(olympic)
pca.olympic = princomp(olympic$tab) plot(pca.olympic$scores[, 1], pca.olympic$scores[, 2], pch = 23, bg = "red", xlab = "Score 1", ylab = "Score 2") Here are the MDS scores which seem to have an axis flipped.  X = as.matrix(olympic$tab)
similarity = X %*% t(X)
n = nrow(olympic$tab) D = sqrt((outer(rep(1, n), diag(similarity)) - 2 * similarity + outer(diag(similarity), rep(1, n)))) D = as.dist(D) MDS = cmdscale(D) plot(MDS[, 1], MDS[, 2], pch = 23, bg = "red", xlab = "Score 1", ylab = "Score 2") plot(MDS[, 1], pca.olympic$scores[, 1], pch = 23, bg = "red") plot(MDS[, 2], pca.olympic$scores[, 2], pch = 23, bg = "red") This exact relation between the MDS coordinates and PCA coordinates depends on the fact that we are using the Euclidean distance between cases. Let’s try the Manhattan distance.  D_L1 = dist(X, "manhattan") MDS_L1 = cmdscale(D_L1) plot(MDS_L1[, 1], MDS_L1[, 2], pch = 23, bg = "red") On inspection, it is not the same as before (which is not surprising). We see the scores are correlated, but not perfectly as before.  plot(MDS[, 1], MDS_L1[, 1], pch = 23, bg = "red") plot(MDS[, 2], MDS_L1[, 2], pch = 23, bg = "red") ## U.S. capitals¶ I formed a file of the distance matrix for the 50 U.S. capitals using Google’s map API following this example. In: import os, urllib, time, random  Rather than type state capitals, I copied a list In: capitals_str = """ Alabama - Montgomery<br> Alaska - Juneau<br> Arizona - Phoenix<br> Arkansas - Little Rock<br> California - Sacramento<br> Colorado - Denver<br> Connecticut - Hartford<br> Delaware - Dover<br> Florida - Tallahassee<br> Georgia - Atlanta<br> Hawaii - Honolulu<br> Idaho - Boise<br> Illinois - Springfield<br> Indiana - Indianapolis<br> Iowa - Des Moines<br> Kansas - Topeka<br> Kentucky - Frankfort<br> Louisiana - Baton Rouge<br> Maine - Augusta<br> Maryland - Annapolis<br> Massachusetts - Boston<br> Michigan - Lansing<br> Minnesota - St. Paul<br> Mississippi - Jackson<br> Missouri - Jefferson City<br> Montana - Helena<br> Nebraska - Lincoln<br> Nevada - Carson City<br> New Hampshire - Concord<br> New Jersey - Trenton<br> New Mexico - Santa Fe<br> New York - Albany<br> North Carolina - Raleigh<br> North Dakota - Bismarck<br> Ohio - Columbus<br> Oklahoma - Oklahoma City<br> Oregon - Salem<br> Pennsylvania - Harrisburg<br> Rhode Island - Providence<br> South Carolina - Columbia<br> South Dakota - Pierre<br> Tennessee - Nashville<br> Texas - Austin<br> Utah - Salt Lake City<br> Vermont - Montpelier<br> Virginia - Richmond<br> Washington - Olympia<br> West Virginia - Charleston<br> Wisconsin - Madison<br> Wyoming - Cheyenne """ state_dict = { 'AK': 'Alaska', 'AL': 'Alabama', 'AR': 'Arkansas', 'AS': 'American Samoa', 'AZ': 'Arizona', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DC': 'District of Columbia', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'GU': 'Guam', 'HI': 'Hawaii', 'IA': 'Iowa', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'MA': 'Massachusetts', 'MD': 'Maryland', 'ME': 'Maine', 'MI': 'Michigan', 'MN': 'Minnesota', 'MO': 'Missouri', 'MP': 'Northern Mariana Islands', 'MS': 'Mississippi', 'MT': 'Montana', 'NA': 'National', 'NC': 'North Carolina', 'ND': 'North Dakota', 'NE': 'Nebraska', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NV': 'Nevada', 'NY': 'New York', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'PR': 'Puerto Rico', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VA': 'Virginia', 'VI': 'Virgin Islands', 'VT': 'Vermont', 'WA': 'Washington', 'WI': 'Wisconsin', 'WV': 'West Virginia', 'WY': 'Wyoming' }  Next, I will form a list of capitals that google map will be able to return the coordinates of. In: capitals_str = capitals_str.replace("<br>","").strip() continental_50 = [] for k in state_dict: if state_dict[k] in capitals_str: continental_50.append(k) # this fails for Indianopolis unfortunately capitals_str = capitals_str.replace(state_dict[k], k) capitals = [','.join(c.split('-')[::-1]) for c in capitals_str.split('\n')]  Retrieve the results in blocks of 25. We do this because google restricts to 100 results every 10 seconds, 2500 results per 24 hours so we sleep for at least 15 seconds. We don’t need to retrieve the whole square matrix as it’s symmetric, we retrieve it in 5x5 blocks In: def get_all_distances(): if not os.path.exists(os.path.join('..', 'distances')): os.makedirs(os.path.join("..", "distances")) for i in range(10): for j in range(i+1): origins = "|".join(capitals[(i*5):(i+1)*5]) destinations = "|".join(capitals[(j*5):(j+1)*5]) url = "http://maps.googleapis.com/maps/api/distancematrix/json?origins=%s&destinations=%s&sensor=false" % (origins, destinations) # form a proper url url = url.replace(' ','+') fname = os.path.join('..', "distances", "results_%d%d.json" % (i,j)) if not os.path.exists(fname): results = urllib.urlopen(url).read() f = file(fname, 'w') f.write(results) f.close() time.sleep(random.randint(15,25)) print i, j get_all_distances()  Now, we parse all the resulting JSON files In: from glob import glob import simplejson import numpy as np, matplotlib.mlab as ML def make_data_matrix(): distance = {} duration = {} for f in glob("../distances/*json"): results = simplejson.load(file(f)) for o, row in zip(results['origin_addresses'], results['rows']): for d, col in zip(results['destination_addresses'], row['elements']): o_st = o.split(',')[-2].strip().upper().split(' ') d_st = d.split(',')[-2].strip().upper().split(' ') distance[(o_st,d_st)] = float(col['distance']['value']) distance[(d_st,o_st)] = distance[(o_st,d_st)] duration[(o_st,d_st)] = float(col['duration']['value']) duration[(d_st,o_st)] = duration[(o_st,d_st)] states = sorted(continental_50) distance_matrix = np.empty(50, [('state', '|S4')] + [(r, np.float) for r in states]) duration_matrix = np.empty(50, [('state', '|S4')] + [(r, np.float) for r in states]) for i, ostate in enumerate(states): distance_matrix['state'][i] = ostate duration_matrix['state'][i] = ostate for dstate in states: distance_matrix[dstate][i] = distance[(ostate, dstate)] duration_matrix[dstate][i] = duration[(ostate, dstate)] ML.rec2csv(distance_matrix, '../data/capital_distance.csv') ML.rec2csv(duration_matrix, '../data/capital_duration.csv') make_data_matrix()  Let’s make a map that we can view to compare our MDS plots to. First, we form a URL, which we paste into an img HTML tag as src below. In: url = """http://maps.googleapis.com/maps/api/staticmap?markers=color:blue|%(capitals)s&sensor=false&size=640x640&zoom=3""" % {'capitals':'|'.join(capitals[:50]).replace(' ','')} html = '<img src="%s">' % url html  Out: '<img src="http://maps.googleapis.com/maps/api/staticmap?markers=color:blue|Montgomery,AL|Juneau,AK|Phoenix,AZ|LittleRock,AR|Sacramento,CA|Denver,CO|Hartford,CT|Dover,DE|Tallahassee,FL|Atlanta,GA|Honolulu,HI|Boise,ID|Springfield,IL|INpolis,IN|DesMoines,IA|Topeka,KS|Frankfort,KY|BatonRouge,LA|Augusta,ME|Annapolis,MD|Boston,MA|Lansing,MI|St.Paul,MN|Jackson,MS|JeffersonCity,MO|Helena,MT|Lincoln,NE|CarsonCity,NV|Concord,NH|Trenton,NJ|SantaFe,NM|Albany,NY|Raleigh,NC|Bismarck,ND|Columbus,OH|OKCity,OK|Salem,OR|Harrisburg,PA|Providence,RI|Columbia,SC|Pierre,SD|Nashville,TN|Austin,TX|SaltLakeCity,UT|Montpelier,VT|Richmond,VA|Olympia,WA|Charleston,WV|Madison,WI|Cheyenne,WY&sensor=false&size=640x640&zoom=3">' U.S. Capitals  capitals = read.table("http://stats202.stanford.edu/data/capital_distance.csv", header = TRUE, sep = ",") MDS = cmdscale(as.dist(capitals[, -1])) plot(MDS, type = "n") text(MDS[, 1], MDS[, 2], names(capitals[, -1]), xlab = "", ylab = "") Let’s correct this East-West swap.  plot(MDS, type = "n") text(-MDS[, 1], MDS[, 2], names(capitals[, -1]), xlab = "", ylab = "") Not perfect, but not too bad. ## Iris data¶ Let’s take another look at how different distances affect the MDS coordinates.  iris = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", sep = ",", header = FALSE) names(iris) = c("sepal.length", "sepal.width", "petal.length", "petal.width", "iris.type")  First, we use Euclidean distance, yielding the usual (unstandardized) PCA scores.  D = dist(iris[, -5], method = "euclidean") MDS = cmdscale(D) plot(MDS[, 1], MDS[, 2], pch = 21, bg = c("red", "green", "blue")[unclass(iris$iris.type)]) How about or Manhattan?

    D = dist(iris[, -5], method = "manhattan")
MDS = cmdscale(D)
plot(MDS[, 1], MDS[, 2], pch = 21, bg = c("red", "green", "blue")[unclass(iris$iris.type)]) And the or max norm?  D = dist(iris[, -5], method = "maximum") MDS = cmdscale(D) plot(MDS[, 1], MDS[, 2], pch = 21, bg = c("red", "green", "blue")[unclass(iris$iris.type)]) Just for fun, let’s try some Minkowski distances. With , this should look close to D = dist(iris[, -5], method = "minkowski", p = 20)
MDS = cmdscale(D)
plot(MDS[, 1], MDS[, 2], pch = 21, bg = c("red", "green", "blue")[unclass(iris$iris.type)]) For small , it should look closer to . We can even take .  D = dist(iris[, -5], method = "minkowski", p = 0.2) MDS = cmdscale(D) plot(MDS[, 1], MDS[, 2], pch = 21, bg = c("red", "green", "blue")[unclass(iris$iris.type)]) In short, it seems easy to distinguish setosa from virginica and versicolor but these two may be harder to separate.