## Exploratory analysis of the HOT.csv data set: a scatter plot of the
## locations, two linear models for MaxJanTemp, and residual / leverage
## diagnostics for the first model.
##
## Columns read below: Name, Latitude, Longitude, Altitude, MaxJanTemp.
##
DF <- read.csv("../../Data/HOT.csv", header = TRUE, stringsAsFactors = FALSE)

## Plot the requested scatter plot:
##
## Longitude is negated before plotting (presumably it is stored as degrees
## west -- confirm against the data file).
##
plot(-DF$Longitude, DF$Latitude,
     type = "p", cex = 1.5, pch = 19, xlim = c(-170, -50))
grid()

## Label some cities:
##
## fixed = TRUE matches the city name literally instead of as a regular
## expression; pos = 4 draws the label to the right of the point.
##
for (nm in c("Miami FL", "New York NY", "Juneau AK", "Honolulu HI")) {
  ext <- grepl(nm, DF$Name, fixed = TRUE)
  text(-DF$Longitude[ext], DF$Latitude[ext], labels = nm, pos = 4)
}

## Build a linear model:
##
m1 <- lm(MaxJanTemp ~ Latitude + Longitude + Altitude, data = DF)
print(summary(m1))
plot(m1, which = 1)  # residuals vs. fitted values

## Display the records with the largest residuals:
##
## NOTE(review): these row indices are hard-coded (presumably read off the
## residual plot above); re-check them if the data file changes.
##
print(DF[c(3, 30, 16), ])

## Plot the hat matrix diagonal against the fitted values:
##
inf <- influence(m1)
plot(m1$fitted.values, inf$hat,
     type = "p", pch = 19, xlab = "Fitted values", ylab = "Hat diagonals")

## Indices of the (up to) four largest hat values, largest first. order() is
## used rather than match(sort(...)) so that tied hat values yield distinct
## points, and the count is taken from the hat vector itself so the indices
## stay valid even if lm() dropped incomplete rows.
##
n_top <- min(4L, length(inf$hat))
ext <- order(inf$hat, decreasing = TRUE)[seq_len(n_top)]

## Label each point with its row name as carried by the fitted values, which
## share their indexing with the plotted x coordinates. The largest point is
## labelled below (pos 1), the others to the right (pos 4).
##
text(m1$fitted.values[ext], inf$hat[ext],
     labels = names(m1$fitted.values)[ext],
     pos = c(1, 4, 4, 4)[seq_len(n_top)])
grid()

## Display the records with the largest influence:
##
## NOTE(review): hard-coded row indices (presumably the points labelled in
## the leverage plot above); re-check them if the data file changes.
##
print(DF[c(16, 3, 60, 61), ])

## Build a second linear model:
##
## Adds a squared term for each predictor; I() keeps ^ as arithmetic inside
## the formula.
##
m2 <- lm(MaxJanTemp ~ Latitude + Longitude + Altitude +
           I(Latitude^2) + I(Longitude^2) + I(Altitude^2),
         data = DF)
print(summary(m2))