# For the first few lab sessions, you will type things in on your own.
# Later, you'll be able to cut and paste bits of R code that you will be given.
# Even when you type things in, note that the up/down arrow keys scroll thru
# all the commands you may have typed in already. So, you don't have to
# type-in things that you've already typed-in.

# First, here are some R basics:

(57 / 276)^2 + 4.3 * 0.001  # = 0.04695123 . Basic algebraic operations.
## Lines starting with "#" are comments.

## Create variables and assign them values.
## Variable names, function names, etc., may contain only alphanumeric
## characters (A-Z, a-z, 0-9), the period "." and the underscore "_".
## Take note, a name must start with a letter, or with a period that is not
## followed by a digit. R is case-sensitive, i.e., "a" and "A" are
## different. Here are some more basic operations:

x <- 3  ## assignment can be done using either "=" or "<-"
x
y = 1 + 1  ## "=" is used here on purpose, to show that it also assigns.
y
log10(y); log2(y)  ## Commands are separated by semicolons (;) or by a newline.
sqrt(y)

# Variables can be either number, vector, matrix, dataframe, character, or
# logical expression. No need for serious definitions here, for we will come
# across these gradually.

# A few ways of entering data in R:
x <- 1:5
x
y <- c(34, 30, 41, 35, 21)
y

mean(y)    ## sample average of y -- measures "location."
median(y)  ## sample median of y -- another measure of "location."
sd(y)      ## sample standard deviation of y -- measures "spread."

# For the following, use your up-arrow key.
range(y)     # Gives two numbers, min and max.
range(y)[1]  # First component of range(y), i.e., min.
range(y)[2]  # Second component of range(y), i.e., max.
min(y)
max(y)
sort(y)

###################################################################
## To get help and examples on any function, say median:
?median  # Explains and gives examples at end, usually.
# q will get you out of the help page.

###################################################################
# R comes with a bunch of data sets.
# This is how you see what they are:
data()  # Lists the available data sets; q quits the listing.
# Now you know where to get data, if you want some!

# This is how you read-in the R data sets:
data(USArrests)   ## Make the dataset available.
?USArrests        # Gives info on data set; q quits.
USArrests         # Simply prints all the data onto the screen.
names(USArrests)  # Tells you the name of the variables. Use up-arrow.
dim(USArrests)    # Tells the (row X column) dimensions. Use up-arrow.
USArrests$Murder  # $ selects a given column/variable, by name. Use up-arrow.
USArrests[, 1]    # Same as above, but selects by column number. Use up-arrow.
murder <- USArrests[, 1]  # Selects and assigns to some variable.
murder            # Shows the murder arrests.
mean(murder)      # Use up-arrow.
sd(murder)        # Standard deviation of the murder arrests.

####################################################################
# This is how you read-in data from a text file. (At the bottom of this file,
# below, you can see how to read-in data from CD of your textbook, etc.)
dat <- read.table(
  "http://www.stat.washington.edu/marzban/390/hist_dat.txt",
  header = FALSE
)
dat

## Now, let's do some histograms on this data.
# A histogram is simply a count (or frequency) plot of different values of a
# variable. It answers the questions, for example, "How many x=1's occur in the
# data?", "How many x=2's occur in the data?" Etc. That's easy when the variable
# (x) is discrete. But what if it's continuous? Then, the answer to all of the
# above questions is "zero." The only reasonable questions are of the type
# "How many cases in the data are within 0.1 of x=1?" Etc. The 0.1 is called
# the class interval, or the bin size. What you will see here is the effect
# of this bin size on the histogram. In R, instead of specifying the bin size,
# it's more convenient to simply specify the number of bins, specified by
# "breaks=" or "nclass=". (R treats a single number as a suggestion only, and
# picks "pretty" break points near it.) Here are some examples.

x <- dat[, 1]  # Selects the first column/variable in the file.
x

# Let's turn the "knob" on binsize and see what happens:
# For the following, USE YOUR UP-ARROW.
par(mfrow = c(3, 3))  # This puts the 9 following histograms on 1 page.
hist(x, breaks = 2)      # looks useless
hist(x, breaks = 3)
hist(x, breaks = 4)      # looks unimodal and bell-shaped.
hist(x, breaks = 5)
hist(x, breaks = 10)     # looks bimodal.
hist(x, breaks = 20)
hist(x, breaks = 30)
hist(x, breaks = 100)    # looks bimodal + outlier
hist(x, breaks = 10000)  # looks useless

# The above suggests that the data/variable x is probably made-up of
# two different groups. For example, x could be "height", in which case
# the two "humps" (at around breaks = 30 or 100) may be identified as the
# heights of boys and girls, respectively. This type of analysis is a
# simple form of data-mining, i.e. trying to figure out what's in the data.

# Take qz1. TAs, scroll up and display on the screen the read.table line.

# This will get you out of R altogether. (If this file is run as a script
# instead of being typed in, nothing below this line is executed.)
q()

####################################################################
# Extras: TAs, give this to students, but do not cover in class,
# unless there is time.

is.vector(x)  # Checking to see if something is a vector.

# To make a sequence of numbers, use c() or seq():
y <- seq(from = 0, to = 1, by = 0.1)
# Or simply,
y <- seq(0, 1, 0.1)

help.search("poisson")  # Searches for "poisson" in all of R.

# If you want the breaks in specific places:
hist(murder, breaks = seq(0, 20, by = 3))
hist(murder, breaks = c(0, 1, 5, 10, 20))

# This is how you read-in data from CD of your textbook:
read.table(file.choose(), header = TRUE)

# For reading excel files, save the file as .csv, and then read it as
read.csv(file.choose(), header = FALSE)

# This is how you read-in data from a file (e.g. hist_dat.txt) in some
# directory:
setwd("G://Documents/2008AU/STAT 390/Lab")  # This is an example!!!
# Read hist_dat.txt from the current working directory (no header row) ...
dat <- read.table("hist_dat.txt", header = FALSE)

# OR from the web
dat <- read.table(
  "http://www.stat.washington.edu/marzban/390/hist_dat.txt",
  header = FALSE
)
###########################################################################