## Prediction using simple linear regression ---------------------------------

## Read and examine the data

## 1. read the lion data (age vs. proportion of black on the nose)
ll <- read.csv('/~lmgonigl/materials-qm/data/lions.csv',
               stringsAsFactors=TRUE,
               strip.white=TRUE)

## 2. inspect the first few rows
head(ll)

## 3. scatter-plot age against the amount of black on the nose
plot(x=ll$black, y=ll$age,
     xlab='Amount of black in nose', ylab='Age',
     pch=16, col='red', las=1)

## Fit a linear model ---------------------------------------------------------

## 1. age as a linear function of nose blackness
out <- lm(age~black, data=ll)
## keep a summary object around (useful for quick extraction of parameter
## estimates)
summ <- summary(out)
summ

## 2. pull out the fitted intercept and slope, then draw the fitted line
b0 <- coef(summ)['(Intercept)','Estimate']
b1 <- coef(summ)['black','Estimate']
abline(a=b0, b=b1, col='blue')
## note that the same line can be added by simply passing the model object to
## abline:
abline(out, lwd=2, lty=2)

## The variance of the residuals for black in the nose tends to rise with
## increasing age, but the trend is not severe.  The residuals might not be
## quite normal, but they are not badly skewed, so we are probably OK.

## 3. full model summary (R-squared = 0.6238)
summary(out)

## 4. 95% confidence intervals for each coefficient
confint(out, '(Intercept)', level=0.95)
confint(out, 'black', level=0.95)

## 5. F-test for dropping the single predictor
drop1(out, test='F')

## 6. standard diagnostic plots
plot(out)
## These plots make it easier to see how the variance of the residuals tends
## to increase at higher fitted values.  A transformation of one or both
## variables is usually the first course of action.  R also has a toolkit for
## robust regression which, as the name suggests, is more robust to
## violations of the standard assumptions.

## Prediction -----------------------------------------------------------------

## 1. redraw the scatter plot with the fitted line
plot(x=ll$black, y=ll$age,
     xlab='Amount of black in nose', ylab='Age',
     pch=16, col='red', las=1)
abline(out, lwd=2, col='blue')

## 2. (continues below)
## 2. confidence band around the fitted lion regression line.
## Build a grid of 20 evenly spaced 'black' values spanning the observed
## range (spell out length.out= rather than relying on partial matching of
## the 'length' abbreviation).
new.vals <- data.frame(black=seq(min(ll$black), max(ll$black), length.out=20))
conf.arr <- predict(out, newdata=new.vals, interval='confidence', level=0.95)
lines(x=new.vals[,'black'], y=conf.arr[,'lwr'], lwd=2, lty=2, col='blue')
lines(x=new.vals[,'black'], y=conf.arr[,'upr'], lwd=2, lty=2, col='blue')

## 3. prediction band for new observations (wider than the confidence band).
## Use the full value 'prediction'; the original 'predict' only worked via
## match.arg() partial matching.
conf.arr <- predict(out, newdata=new.vals, interval='prediction', level=0.95)
lines(x=new.vals[,'black'], y=conf.arr[,'lwr'], lwd=2, lty=3, col='blue')
lines(x=new.vals[,'black'], y=conf.arr[,'upr'], lwd=2, lty=3, col='blue')

## Effects of light treatment on circadian rhythms ----------------------------

## Read and examine the data

## 1.
kk <- read.csv('/~lmgonigl/materials-qm/data/knees.csv',
               stringsAsFactors=TRUE, strip.white=TRUE)
## 2.
head(kk)
## 3.
class(kk[,'treatment'])
## 4.
levels(kk[,'treatment'])
## having control first is useful, because the 'control' will then be used as
## the intercept, and all comparisons will be made relative to that
## 5. REORDER (not relabel) the factor levels.  The original code,
##      levels(kk[,'treatment']) <- levels(kk[,'treatment'])[c(1,3,2)]
## was a bug: assigning to levels() RENAMES the levels in place, silently
## swapping the 'eyes' and 'knee' labels on the observations.  factor() with
## an explicit 'levels' argument changes only the level order while leaving
## each observation's value intact (the same idiom used for the fruitflies
## data later in this file).
kk[,'treatment'] <- factor(kk[,'treatment'],
                           levels=levels(kk[,'treatment'])[c(1,3,2)])
## 6. strip-plot of shift by treatment group
plot(x=as.integer(kk[,'treatment']), y=kk[,'shift'],
     ylim=range(kk[,'shift']), xlim=c(0.75,3.25),
     pch=16, las=1, col='red', xaxt='n', xlab='', ylab='Shift')
axis(1, at=1:3, labels=levels(kk[,'treatment']))

## Fit a linear model

## 1. shift as a function of the (categorical) treatment
out <- lm(shift~treatment, data=kk)
## also create a summary object
summ <- summary(out)
## 2. (continues below)
## 2. Making the plot manually (a good learning exercise): jitter the group
## positions so overlapping points remain visible.
plot(x=jitter(as.integer(kk[,'treatment'])), y=kk[,'shift'],
     ylim=range(kk[,'shift']), xlim=c(0.75,3.25),
     pch=16, las=1, col='red', xaxt='n', xlab='', ylab='Shift')
axis(1, at=1:3, labels=levels(kk[,'treatment']))

## overlay the fitted group means as short horizontal segments; the knee and
## eyes means are the intercept plus their respective treatment contrasts
mean.control <- coef(summ)['(Intercept)','Estimate']
mean.knee <- mean.control + coef(summ)['treatmentknee','Estimate']
mean.eyes <- mean.control + coef(summ)['treatmenteyes','Estimate']
lines(x=c(0.85,1.15), y=rep(mean.control, 2), col='blue', lwd=2)
lines(x=c(1.85,2.15), y=rep(mean.knee, 2), col='blue', lwd=2)
lines(x=c(2.85,3.15), y=rep(mean.eyes, 2), col='blue', lwd=2)

## now that you've had some practice plotting manually, try the 'visreg'
## function, which does all the hard work for you!
library(visreg)
visreg(out, xvar='treatment')

## 3. diagnostic plots
plot(out)

## 4. the design (model) matrix
model.matrix(out)
## One of the columns must be dropped because the information in the 4
## columns is redundant in a particular way (a combination of three of the
## columns exactly equals the fourth).  By default, R drops the column
## corresponding to the first level of the categorical variable which, in
## this case, is the control.

## 5.
summ
## The mean of the first group, and the differences of the second and third
## groups from the first.

## 6. estimated marginal means
library(emmeans)
emmeans(out, spec='treatment')
## the means are the same as the raw group means, because 'treatment' is the
## only predictor in the model
mean(kk$shift[kk$treatment=='control'])
mean(kk$shift[kk$treatment=='knee'])
mean(kk$shift[kk$treatment=='eyes'])
## the standard errors are not the same, however:
## control
ind <- kk$treatment=='control'
sd(kk$shift[ind])/sqrt(sum(ind))
## knee
ind <- kk$treatment=='knee'
sd(kk$shift[ind])/sqrt(sum(ind))
## eyes
ind <- kk$treatment=='eyes'
sd(kk$shift[ind])/sqrt(sum(ind))
## these differ from the emmeans values, because emmeans uses the entire
## model object to calculate them.

## 7. (continues below)
drop1(out, test='F')

## Fly sex and longevity revisited --------------------------------------------

## Read and examine the data

## 1.
ff <- read.csv('/~lmgonigl/materials-qm/data/fruitflies.csv',
               stringsAsFactors=TRUE, strip.white=TRUE)
## 2.
head(ff)
## 3.
class(ff$treatment)
## 4.
levels(ff$treatment)
## 5. move the 5th level (the control) to the front so it becomes the baseline
ff$treatment <- factor(ff$treatment, levels=levels(ff$treatment)[c(5,1:4)])

## Fit a linear model

## 1. longevity as a function of body size plus treatment
out <- lm(longevity.days~thorax.mm+treatment, data=ff)
summ <- summary(out)
summ

## 2. diagnostics
plot(out)

## 3. refit on the log scale
out <- lm(log(longevity.days)~thorax.mm+treatment, data=ff)
summ <- summary(out)
summ
plot(out)

## 4. visualise the partial effect of each predictor
library(visreg)
visreg(out, xvar='thorax.mm', by='treatment')
visreg(out, xvar='treatment', by='thorax.mm')

## 5.
summ
## '8 virgin females' differs most from the control group

## 6. estimated marginal means for each predictor
library(emmeans)
emmeans(out, spec='thorax.mm')
emmeans(out, spec='treatment')

## 7.
drop1(out, test='F')

## 8. add a size-by-treatment interaction
out.with.int <- lm(log(longevity.days)~thorax.mm*treatment, data=ff)
summ.with.int <- summary(out.with.int)
summ.with.int
plot(out.with.int)

## 9.
drop1(out.with.int, test='F')
## there is no support for an interaction

## 10. compare adjusted R^2 of the model without the interaction ...
summ$adj.r.squared
## ... and of the model with it
summ.with.int$adj.r.squared
## not that different...