#R-help to exercise 14
 
 
# Read the data into a dataframe, give names to the variables, and inspect the data:
rock=read.table("http://www.uio.no/studier/emner/matnat/math/STK4900/data/rock.dat",col.names=c("area","peri","shape","perm"))
rock
 
# Check that the data correspond to those given in the exercise.
 
# Attach the dataframe 
attach(rock)
 
 
# QUESTION a)
 
# Descriptive statistics, histogram and boxplot of "perm" (make one plot at a time):
summary(perm)
hist(perm)
boxplot(perm)
 
# How is the distribution of "perm"? Symetric or skewed?
 
 
# Try a log-transform:
summary(log(perm))
hist(log(perm))
boxplot(log(perm))
 
# How is the distribution of "log(perm)"? Symetric or skewed?
# (We choose to use "log(perm)" in question c.)
 
 
 
# QUESTION b)
 
# Compute the correlation between the three covariates and make scatterplots of the covariates:
cor(rock[,1:3])
plot(rock[,1:3])
 
# Is there a strong correlation between some of the covariates? 
# How could this affect the regression analysis?
 
 
 
# QUESTION c)
 
# Plot "log(perm)" versus the other covariates (make one plot at a time):
plot(area, log(perm))
plot(peri, log(perm))
plot(shape, log(perm))
 
# Which of the covariates seem to be of importance?
 
 
# Linear regression of ?log(perm)? using all three covariates:
# (the options "x=T" and "y=T" are included because of the computations in question d)
pfit1=lm(log(perm)~area+peri+shape, x=T, y=T)
summary(pfit1)
 
# Which of the covariates seem to be of importance?
# Is this in agreement with what you could see from the scatterplots?
 
 
 
# QUESTION d)
 
# According to their t-values the order of the covariates is: peri > area > shape
 
# Fit simpler regression models (according to the given order):
pfit2=lm(log(perm)~area+peri, x=T, y=T)
pfit3=lm(log(perm)~peri, x=T, y=T)
 
 
# Compute cross-validated R2:
cv.R2(pfit3)
cv.R2(pfit2)
cv.R2(pfit1)
 
# Which of the three models give the best prediction?
# Compare the cross-validated R2 with R2 from question c. What do you see?
 
 
 
# QUESTION e)
 
# Fit a model with second order terms and interactions:
pfit1=lm(log(perm)~area+peri+shape+I(area^2)+I(peri^2)+I(shape^2)+area:peri+area:shape+peri:shape, x=T, y=T)
summary(pfit1)
 
# Which variables seem to be most important? 
# According to their t-values the variables are ordered as: area:peri > area^2 > peri > area > peri^2 > area:shape > peri:shape > shape^2 > shape
 
# Fit models according to this ordering:
pfit2=lm(log(perm)~area+peri+I(area^2)+I(peri^2)+I(shape^2)+area:peri+area:shape+peri:shape,x=T,y=T)
pfit3=lm(log(perm)~area+peri+I(area^2)+I(peri^2)+area:peri+area:shape+peri:shape,x=T,y=T)
pfit4=lm(log(perm)~area+peri+I(area^2)+I(peri^2)+area:peri+area:shape,x=T,y=T)
pfit5=lm(log(perm)~area+peri+I(area^2)+I(peri^2)+area:peri,x=T,y=T)
pfit6=lm(log(perm)~area+peri+I(area^2)+area:peri,x=T,y=T)
pfit7=lm(log(perm)~peri+I(area^2)+area:peri,x=T,y=T)
pfit8=lm(log(perm)~I(area^2)+area:peri,x=T,y=T)
pfit9=lm(log(perm)~area:peri,x=T,y=T)
 
# Compute cross-validated R2 for the models:
cv.R2(pfit9)
cv.R2(pfit8)
cv.R2(pfit7)
cv.R2(pfit6)
cv.R2(pfit5)
cv.R2(pfit4)
cv.R2(pfit3)
cv.R2(pfit2)
cv.R2(pfit1)
 
 
# Which model gives the best prediction?
 
 
 
# QUESTION f)
 
# Use the same "recipe" as in question e 
# Which model gives the best prediction? Is it the same one as in question e?
 
 
 
# QUESTION g)
 
# Make various plots of the residuals. 
# See previous exercise for help with R-commands.