#Note: the number sign begins a comment #Note: this file is not executable in batch mode because it # contains examples with prompts included, and output # that does not obey R syntax #If this is my first time with this project, I want to create #an R workspace. Immediately upon starting R, choose #File-->Save Workspace. Navigate to the location and hit enter. #(i.e., don't give it a name, just .RData) #Now your are working in that directory. #You do not need to repeat this step next time. Instead #use File-->Load Workspace to begin working with this #workspace. #Note: This file is not fully executable because it contains both #commands and output. For example, executing "[1] 240 10" below #(which is output) would produce an error. You must cut and #paste (or F5) the lines you wish to execute. #Assignment #constant #EDA examples: # Read in data # This command creates a 'data frame' # # data example: lake # # lake = ID number of the lake (10--40). # site = site number (1--8). # build = number of buildings per km of shoreline per lake. # ow1822 = relative exposure of site to winds over 18 m/s when the water is open (April to October) per site. # lnls = ln(density of logs +1) per site (response). # lnll = ln(density of logs +1) per lake. # sqrtts = square root of tree density per site (#/hectare). # sqrtbas = square root of total basal area per site (cm^2/hectare). # sqrtbal = total basal area per lake (averaged over sites) (cm^2/hectare). # sqrtttl = square root of tree density per lake (averaged over sites) (#/hectare). lake = read.table("lake.dat", header=T) #lakem = read.table("lakem.dat", na.strings=".", header=T) dim(lake) [1] 240 10 names(lake) [1] "lake" "site" "build" "ow1822" "lnls" "lnll" "sqrtts" [8] "sqrtbas" "sqrtbal" "sqrttl" # univariate summary statistics # clean out errors, if any summary(lake) # lake site build ow1822 # Min. :10.0 Min. :1.00 Min. : 0.00 Min. : 0.00 # 1st Qu.:17.0 1st Qu.:2.75 1st Qu.: 52.67 1st Qu.: 0.40 # Median :24.5 Median :4.50 Median : 94.09 Median : 5.45 # Mean :25.2 Mean :4.50 Mean :104.62 Mean :10.42 # 3rd Qu.:33.0 3rd Qu.:6.25 3rd Qu.:135.95 3rd Qu.:18.40 # Max. :42.0 Max. :8.00 Max. :416.80 Max. :38.90 # lnls lnll sqrtts sqrtbas # Min. :0.0000 Min. :0.000 Min. : 0.00 Min. : 0.0 # 1st Qu.:0.0000 1st Qu.:3.350 1st Qu.:24.49 1st Qu.: 384.3 # Median :0.6932 Median :4.210 Median :30.00 Median : 509.2 # Mean :1.1820 Mean :4.182 Mean :29.43 Mean : 500.7 # 3rd Qu.:2.0790 3rd Qu.:5.156 3rd Qu.:36.97 3rd Qu.: 651.2 # Max. :5.3520 Max. :6.486 Max. :50.99 Max. :1093.0 # NA's :2.0000 # sqrtbal sqrttl # Min. :377.6 Min. :19.36 # 1st Qu.:458.3 1st Qu.:27.61 # Median :519.1 Median :31.29 # Mean :532.6 Mean :30.85 # 3rd Qu.:621.4 3rd Qu.:34.64 # Max. :677.0 Max. :43.30 # The $ sign selects a particular column of the data, as long # as the data are formulated as a data.frame # Notice that NA is a special type of element lake$lnls # [1] 2.0790 1.3860 0.0000 0.6932 1.6090 0.0000 0.6932 0.6932 0.0000 0.0000 # [11] 0.0000 0.0000 1.6090 1.7920 1.0990 0.0000 1.0990 0.0000 0.0000 2.4850 # [21] 0.6932 0.6932 1.0990 1.6090 1.0990 2.5650 3.7140 1.0990 3.7610 3.2960 # [31] 2.8330 1.9460 1.3860 0.6932 1.9460 1.7920 0.0000 1.7920 2.3980 1.0990 # [41] 0.0000 0.6932 0.0000 0.6932 0.0000 0.6932 1.0990 0.0000 3.0910 1.3860 # [51] 0.0000 3.7610 1.9460 2.3980 1.0990 1.0990 2.6390 0.0000 1.0990 0.0000 # [61] 0.0000 2.5650 1.7920 0.6932 1.3860 0.0000 NA 0.6932 0.0000 2.8330 # [71] 2.1970 1.7920 0.0000 0.0000 0.0000 0.0000 0.0000 1.3860 0.6932 1.9460 # [81] 0.0000 0.0000 0.0000 2.5650 5.3520 0.0000 0.0000 0.0000 0.0000 2.3980 # [91] 4.4430 4.0430 4.0600 2.7730 3.6110 1.6090 3.4660 0.6932 0.0000 0.6932 # [101] NA 1.6090 3.1350 0.6932 0.0000 0.6932 0.0000 2.0790 2.1970 1.6090 # [111] 1.0990 0.0000 0.6932 3.3670 2.7080 3.5260 2.3980 3.9120 2.3030 0.0000 # [121] 0.6932 3.0910 2.1970 2.5650 3.3320 3.4010 2.5650 3.0910 1.6090 1.0990 # [131] 0.0000 2.1970 2.1970 2.6390 1.3860 0.0000 0.0000 0.0000 0.0000 0.0000 # [141] 0.6932 0.0000 1.9460 0.0000 0.0000 0.0000 0.0000 1.7920 1.3860 2.0790 # [151] 0.0000 1.9460 0.0000 1.6090 0.0000 0.6932 0.6932 0.0000 0.0000 0.0000 # [161] 1.7920 0.0000 1.3860 1.3860 2.3030 1.6090 1.9460 1.6090 0.0000 0.0000 # [171] 2.3980 2.7730 1.0990 2.1970 1.0990 0.0000 0.0000 0.0000 1.0990 2.3030 # [181] 0.0000 0.0000 0.0000 0.0000 3.4660 0.6932 1.0990 0.6932 2.4850 2.8330 # [191] 1.7920 1.0990 0.6932 2.3030 1.3860 0.0000 1.6090 0.0000 0.6932 1.3860 # [201] 0.0000 0.0000 0.0000 0.6932 0.0000 0.0000 0.0000 0.0000 0.6932 0.0000 # [211] 1.9460 0.0000 0.0000 2.3980 0.0000 0.0000 2.9960 2.3030 2.0790 2.5650 # [221] 3.1780 3.8070 2.4850 3.2580 0.0000 0.0000 0.0000 0.0000 2.6390 0.0000 # [231] 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 By default, NA contaminates anything it touches: mean(lake$lnls) [1] NA # univariate plots hist(lake$lnls) #right click on plot or *raise graph* and use File-->Save as #to save graph. The latter method can save as PDF plot(density(lake$lnls)) # Error in density.default(lake$lnls) : 'x' contains missing values # Error in plot(density(lake$lnls)) : error in evaluating the argument 'x' in selecting a method for function 'plot' plot(density(lake$lnls, na.rm=T)) boxplot(lake$lnls) plot(sort(lake$lnls)) # bivariate plots and statistics plot(lake$lnls~lake$build) pairs(lake) # univariate plots par(mfrow=c(2,2)) hist(lake$lnls, main="") plot(density(lake$lnls), main="") plot(density(lake$lnls, na.rm=T), main="") boxplot(lake$lnls) plot(sort(lake$lnls),main="") plot(lake$lnls~lake$build) pairs(lake) require(corrplot) M <- cor(lake,use= "complete.obs") corrplot(M,order="hclust", n.col.legend=7,color=TRUE,method="ellipse") #------------------------------------------------------------# cor(lake$lnls, lake$build) # Error in cor(lake$lnls, lake$build) : missing observations in cov/cor # many R functions have an optional argument na.rm=T, but cor() doesn't help(cor) cor(lake$lnls, lake$build, use="pairwise.complete.obs") [1] -0.3155732 round( cor(lake, use="pairwise.complete.obs"), 2) # lake site build ow1822 lnls lnll sqrtts sqrtbas sqrtbal sqrttl # lake 1.00 0.00 0.07 0.02 -0.12 -0.31 -0.07 0.04 0.14 -0.15 # site 0.00 1.00 -0.49 0.00 0.08 0.00 -0.05 0.15 0.00 0.00 # build 0.07 -0.49 1.00 0.13 -0.32 -0.24 -0.04 -0.11 0.03 -0.10 # ow1822 0.02 0.00 0.13 1.00 -0.41 -0.64 -0.05 0.13 0.24 -0.08 # lnls -0.12 0.08 -0.32 -0.41 1.00 0.57 0.24 0.09 -0.16 0.16 # lnll -0.31 0.00 -0.24 -0.64 0.57 1.00 0.17 -0.06 -0.20 0.32 # sqrtts -0.07 -0.05 -0.04 -0.05 0.24 0.17 1.00 0.65 0.32 0.56 # sqrtbas 0.04 0.15 -0.11 0.13 0.09 -0.06 0.65 1.00 0.44 0.31 # sqrtbal 0.14 0.00 0.03 0.24 -0.16 -0.20 0.32 0.44 1.00 0.56 # sqrttl -0.15 0.00 -0.10 -0.08 0.16 0.32 0.56 0.31 0.56 1.00 plot(lake$lnls~factor(lake$lake)) boxplot(split(lake$lnls,lake$lake)) coplot(sqrtts~sqrttl | site, lake, panel=function(x, y,...){panel.smooth(x, y, span = .8)} ) #-------------------------------------------------------- # # data example: eire # # towns: Towns/unit area. # ROADACC: arterial road network accessibility in 1961. # OWNCONS: percentage in value terms of gross agricultural output of each county consumed by itself. # POPCHG: 1961 population as percentage of 1926. # RETSALE: value of retail sales (?000). # INCOME: total personal income (?000). # names: County names # load package library(spdep) # load data data(eire) eire.df # A towns pale size ROADACC OWNCONS POPCHG RETSALE INCOME # Carlow 34.20 0.12 1 1087 3664 8.6 97 2962 7185 # Cavan 29.68 0.01 0 2133 5000 15.0 69 4452 9459 # ... # what is eire? help(eire) # subset data eire1 = eire.df[,c(2,5:9)] eire1 # eire1 # towns ROADACC OWNCONS POPCHG RETSALE INCOME # Carlow 0.12 3664 8.6 97 2962 7185 # Cavan 0.01 5000 15.0 69 4452 9459 # Clare 0.01 4321 19.0 78 3460 12435 # Cork 0.03 4118 9.0 90 28402 65901 # Donegal 0.03 7500 27.0 75 7478 17626 # Dublin 0.61 3078 9.4 142 89424 164631 # Galway 0.02 4537 21.9 88 8972 26950 dim(eire1) # univariate summary statistics # clean out errors, if any summary(eire1) # univariate plots par(mfrow=c(2,2)) hist(eire1$OWNCON, main="") plot(density(eire1$OWNCON), main="") boxplot(eire1$OWNCON) plot(sort(eire1$OWNCON),main="") # bivariate plots and statistics par(mfrow=c(1,1)) plot(eire1$OWNCON~eire1$POPCHG) pairs(eire1) cor(eire1$OWNCON, eire1$POPCHG) cor(eire1) #extract a value eire1[1,2] # [1] 3664 #extract a row or column eire1[2,] # towns ROADACC OWNCONS POPCHG RETSALE INCOME # Cavan 0.01 5000 15 69 4452 9459 #data type = logical bigger=eire1$ROADACC > 4000 bigger # [1] FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE # [13] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE # [25] FALSE FALSE sum(bigger) [1] 15 eire1$ROADACC[bigger] # [1] 5000 4321 4118 7500 4537 5140 5000 4018 4250 6815 4008 4500 4108 4500 5997 eire1[bigger,] # towns ROADACC OWNCONS POPCHG RETSALE INCOME # Cavan 0.01 5000 15.0 69 4452 9459 # Clare 0.01 4321 19.0 78 3460 12435 # Cork 0.03 4118 9.0 90 28402 65901 # Donegal 0.03 7500 27.0 75 7478 17626 # Galway 0.02 4537 21.9 88 8972 26950 # Kerry 0.01 5140 17.0 78 6341 20510 # Leitrim 0.01 5000 23.1 60 1885 5709 # Limerick 0.08 4018 11.4 95 10786 27395 # Longford 0.05 4250 19.0 77 1960 5297 # Mayo 0.01 6815 30.0 71 6758 19201 # Meath 0.59 4008 8.7 103 3356 14512 # Monaghan 0.00 4500 13.0 72 3960 8396 # Offaly 0.06 4108 14.3 98 3817 10320 # Roscommon 0.03 4500 23.0 71 2821 10223 # Sligo 0.02 5997 22.0 75 3535 9461 #also could use: subset(eire1,ROADACC>4000) eire2=eire1 eire2[2,2]=NA eire2[1:4,] whobad=is.na(eire2$ROADACC) eire2[!whobad,][1:4,] # ! means "not" #or,... subset(eire2,!is.na(ROADACC))[1:4,] #------------------------------------------ # maptools library(maptools) brks <- round(fivenum(eire.df$OWNCONS), digits=2) cols <- grey(4:1/5) par(mfrow=c(1,1)) plot(eire.polys.utm,col=cols[findInterval(eire.df$OWNCONS, brks)], forcefill=FALSE) title(main="Percentage own consumption of agricultural produce") legend(x=c(-50, 70), y=c(6120, 6050), legend=leglabs(brks), fill=cols, bty="n")