# The following will illustrate some of the concepts we introduced during last # week in Chapter 3. # 1) Scatterplots: # Let's pick a 100 random x values, and corresponding y values that have # some linear association with x. We'll change the amount of linear association # by adding different amounts of noise to y. Look: # Cut/paste the following 2 blocks from # http://www.stat.washington.edu/marzban/390/lab3_supp.txt x=runif(100,0,1) # Take 100 points from a unif dist between 0 and 1. hist(x) # you can see that it does look uniform. e=rnorm(100,0,0.1) # Make a normal variable, e, with mu=0, sigma=0.1 hist(e) # It looks normal. y1= 2*x # Make a perfect relation between x and y. y2=2*x + e # Ibid, but with some noise added . y3=2*x + rnorm(100,0,0.5) # Ibid, with more noise y4=2*x + rnorm(100,0,1.0) # Ibid. par(mfrow=c(2,2)) plot(x,y1) # USE UP-ARROW plot(x,y2) plot(x,y3) plot(x,y4) # Note that too much noise, e, makes it hard to see # the linear relationship between x and y. ############################################################################ # 2) Correlation # To quantify the strength of the association between two continuous variables, # Pearson's correlation coefficient (i.e., correlation), can be computed. cor(x,y1) # Check against the scatterplots, to get a feeling for r cor(x,y2) # USE UP-ARROW. cor(x,y3) cor(x,y4) ######################################################################## # 3) Defects of Correlation: # In class, I argued that r can be thrown off, and hence become misleading, # in several situations. Let me illustrate those claims: # Cut/paste the next block from the web, if you prefer. # It makes three sets of data: (x,y), (x1,y1), (x2,y2). # We'll look at each, below. set.seed(123) x = runif(100,0,1) e = rnorm(100,0,0.5) y = 1+ 2*x + e x1 = rnorm(100,0,50) y1 = rnorm(100,0,50) x2 = 1000 + rnorm(100,0,50) y2 = 1000 + rnorm(100,0,50) plot(x,y) cor(x,y) # Note the correlation of 0.7561632. x[101] = 0.2 # Now, one outlier can artificially reduce r. y[101] = 8.0 plot(x,y) cor(x,y) # to 0.51602. x[101] = 2.0 # A different outlier can artificially increase r. y[101] = 8.0 plot(x,y) cor(x,y) # to 0.8129318. # Clusters, too, can make r meaningless. Look! plot(x1,y1) cor(x1,y1) # No corr between x and y in cluster 1 plot(x2,y2) cor(x2,y2) # No corr between x and y in cluster 2 x = c(x1,x2) # Combine/concatenate the 2 clusters. y = c(y1,y2) plot(x,y) cor(x,y) # Now, r incorrectly sees a correlation between x and y. ######################################################################### # 4) Regression: # In dealing with two continuous variables, we've learned that the first thing # to do is to look at their scatterplot. The correlation coefficient # summarizes the strength of the relationship between them, but it does not # allow us to predict y from x. What does that is "regression" or the # line that fits "through" the scatterplot. # The function lm() in R does regression, i.e., it fits a line thru a # scatterplot, or a surface thru higher-dimensional data (later!). # Let us just do simple linear regression, i.e. a fit of just one pair of # variables, x and y. # Consider a fake/simulated example: rm(list=ls(all=TRUE)) set.seed(123) x = runif(100,0,1) # x is uniform between 0 and 1. e = rnorm(100,0,1) # error is normal with mean=0, sigma=1. y = 10 + 2*x + e # The real/true line is y = 10 + 2x. plot(x,y) # Here is the scatterplot, cor(x,y) # and the correlation between x and y. lm.1 = lm(y ~ x) # lm stands for linear model. lm.1 # Note that the estimated coefficients are pretty close # to the true ones (i.e., intercept=10, slope=2) abline(lm.1) # This draws the fit on the scatterplot. # If you want to know what else is contained in lm.1, do this: names(lm.1) ############################################## # Now, the example data from lecture: Compare answers there with those below. x = c(72,70,65,68,70) y = c(200,180,120,118,190) plot(x,y) cor(x,y) lm.1 = lm(y~x) abline(lm.1) # Draws the fit lm.1 # Gives you intercept and slope. summary(lm.1) # Gives that, and more (for later). ############################################# # TAs: Do NOT go over this in lab even if there is time. It's only for # the students to use, if they want to, on their own. # Superposition of histograms on a scatterplot: # In class, I mentioned that I may give you a piece of code that can # superpose histograms on scatterplots. It's somewhat involved, and so # don't waste time in trying to understand what it does. Just use it # on your own problems, and you'll see what needs to be changed. set.seed(123) x = runif(100,0,1) # Again, the well-behaved data from above. y = 10 + 2*x + rnorm(100,0,0.5) xhist = hist(x, breaks=seq(min(x),max(x)+0.1,0.1), plot=FALSE) yhist = hist(y, breaks=seq(min(y),max(y)+0.5,0.5), plot=FALSE) top = max(c(xhist$counts, yhist$counts)) xrange = c(min(x),max(x)+0.5 ) yrange = c(min(y),max(y)+0.5 ) nf = layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), TRUE) # layout.show(nf) par(mar=c(3,3,1,1)) plot(x, y, xlim=xrange, ylim=yrange, xlab="", ylab="") par(mar=c(0,3,1,1)) barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0) par(mar=c(3,0,1,1)) barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE) #########################################################################