# Your TA will show you how to launch R.
# For most of the lab sessions you will type things in on your own. This is
# painful, but important. Later, you'll be able to cut/paste bits of R code
# that you will be given. Even when you type things in an R window, note that
# the up/down arrow keys scroll through all the commands you have typed in
# already. So, you don't have to type in things that you've already typed in.

# First, here are some R basics:

(57/276)^2 + 4.3*.001   # = 0.04695123 . Basic algebraic operations.

## Lines starting with "#" are comments.

## Create variables and assign them values:
## Variable names, function names, etc., should contain only alphanumeric
## characters (A-Z, a-z, 0-9) and the period ".". Take note, a name cannot
## start with a digit, nor with a "." that is followed by a digit. R is
## case-sensitive, i.e., "a" and "A" are different.

# Here are some more basic operations:
x = 3   ## assignment can be done using either "=" or "<-"
x       # Typing a name by itself prints its value.
y = 1 + 1
y
log10(y)
log2(y)
sqrt(y)

# Variables can be either number, vector, matrix, dataframe, character, or
# logical expression. No need for their definitions here, for we will come
# across them gradually.

# A few ways of entering data in R:
x = 1:5                     # The integers 1, 2, ..., 5.
x
y = c(34, 30, 41, 35, 21)   # c() combines the listed values into a vector.
y
mean(y)     # sample mean of y -- measures "location."
median(y)   # sample median of y -- another measure of "location."
sd(y)       # sample standard deviation of y -- measures "spread."
            # We may not have done sd in class, yet, but don't worry.

# For the following, use your up-arrow key.
range(y)      # Gives two numbers, min and max.
range(y)[1]   # First component of range(y), i.e., min
range(y)[2]   # Second ..., i.e., max.
min(y)
max(y)
sort(y)

###################################################################
## To get help and examples on any function, say median:
?median   # Explains and usually gives examples at end.
          # q will get you out of the help page.
help.search("histogram")   # searches all of R (on your computer). q quits.
###################################################################
# R comes with a bunch of data sets. This is how you see what they are:
data()   # Lists all R data sets; q quits.
         # Now you know where to get data, if you want some!

# This is how you read-in the R data sets:
data(USArrests)    # This loads a specific R dataset.
?USArrests         # Gives info on data set; q quits.
USArrests          # Simply prints all the data onto the screen.
names(USArrests)   # Tells you the names of the variables. Use up-arrow.
dim(USArrests)     # Tells the (row X column) dimensions. Use up-arrow.
USArrests$Murder   # $ selects a given column/variable, by name. Use up-arrow.
USArrests[, 1]     # Same as above, but selects by column number. Use up-arrow.
murder = USArrests$Murder   # Selects (by name) and assigns to some variable.
murder                      # Shows murder.
x = USArrests[, 1]   # Selects (by column number) and assigns to some variable.
x                    # Shows x, i.e., the same Murder column as above.
mean(x)              # Use up-arrow.
sd(x)                # Standard deviation of x (the Murder column).

####################################################################
# Many of the hw problems will require reading in data from the CD.
# You can either enter the data via one of the means above, e.g., x = c(1,2,3),
# or read-in data from the CD of your textbook, like this
# (file.choose() lets you pick the file interactively):
dat = read.table(file.choose(), header = TRUE)

# For reading excel files, save the file as .csv, and then read it as
dat = read.csv(file.choose(), header = FALSE)

# Note: always spell out TRUE/FALSE. The short forms T and F are ordinary
# variables that can be overwritten, so they are not safe to rely on.

# The following shows how you read-in data from a text file.
# (At the bottom of this file, below, you can see how to change directory.)
dat = read.table("http://www.stat.washington.edu/marzban/390/hist_dat.txt",
                 header = FALSE)
dat

## Now, let's do some histograms on this data.
# A histogram is simply a count (or frequency) plot of different values of a
# variable. It answers the questions, for example, "How many x=1's occur in the
# data?", "How many x=2's occur in the data?" Etc. That's easy when the variable
# (x) is discrete. But what if it's continuous?
# Then, the answer to all of the above questions is "zero." The only
# reasonable questions are of the type "How many cases fall between 0 and
# 0.1? How many between 0.1 and 0.2?" Etc. The 0.1 is called the class
# interval, or the bin size. What you will see here is the effect of this
# bin size on the histogram. In R, instead of specifying the bin size, it's
# more convenient to simply specify the number of bins, specified by
# "breaks=". (R treats a single number given to "breaks=" as a suggestion
# only, so the number of bins actually drawn may differ a little.)
# Here are some examples.

x = dat[, 1]   # Selects the first column/variable in the file.
x

# Let's turn the "knob" on binsize and see what happens:
# For the following, USE YOUR UP-ARROW.
op = par(mfrow = c(3, 3))   # This puts the 9 following histograms on 1 page.
                            # op remembers the previous layout.
hist(x, breaks = 2)       # looks useless
hist(x, breaks = 3)
hist(x, breaks = 4)       # looks unimodal and bell-shaped.
hist(x, breaks = 5)
hist(x, breaks = 10)      # looks bimodal.
hist(x, breaks = 20)
hist(x, breaks = 30)
hist(x, breaks = 100)     # looks bimodal + outlier
hist(x, breaks = 10000)   # looks useless
par(op)                   # Puts the plot layout back the way it was.

# The above suggests that the data/variable x is probably made-up of
# two different groups. For example, x could be "height", in which case
# the two "humps" (at around breaks = 30 or 100) may be identified as the
# heights of boys and girls, respectively. This type of analysis is a
# simple form of data-mining, i.e. trying to figure out what's in the data.

##################################################################
# Finally, how to make a "density scale" histogram:

dat = read.table("http://www.stat.washington.edu/marzban/390/hist_dat.txt",
                 header = FALSE)
x = dat[, 1]

# Recall that a density scale histogram is a relative frequency histogram where
# the y-axis is also divided by the bin size. R does it like this:
op = par(mfrow = c(1, 2))   # Two plots side by side; op remembers the old layout.
hist(x)                     # "regular" histogram
hist(x, freq = FALSE)       # density scale histogram
par(op)                     # Puts the plot layout back the way it was.

# You can see that the shape is the same, but the advantage of the density
# is that the area under it is 1.
# By the way, since by default R takes the binsizes to be constant, the
# above density histogram has the same shape as a relative frequency
# histogram; the bar heights differ only by the constant factor 1/binsize.

###############################################################
# Take qz1.

q()   # This will get you out of R altogether.

####################################################################
####################################################################
# Time permitting:
# (This part uses x and murder from earlier in the lab. If you have already
# quit R with q(), re-create them first.)

is.vector(x)   # Checking to see if something is a vector.

# To make a sequence of numbers, use c() or seq():
y = seq(from = 0, to = 1, by = 0.1)
# Or simply,
y = seq(0, 1, 0.1)

# If you want the breaks in specific places:
hist(murder, breaks = seq(0, 20, by = 3))   # Equal-width bins: 0, 3, ..., 18.
hist(murder, breaks = c(0, 1, 5, 10, 20))   # Unequal bins; R then uses the
                                            # density scale by default.

# This is how you read-in data from a file (e.g. hist_dat.txt) in some
# directory:
setwd("G://Documents/2009winter/STAT 390/Lab")   # This is an example!!!
                                                 # Use your own directory.
dat = read.table("hist_dat.txt", header = FALSE)
###########################################################################