library(tidyverse)
We’ll use a built-in data set, mtcars:
data(mtcars)  # load the built-in data set into the workspace
head(mtcars)  # peek at the first 6 rows
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
We’ve covered some simple measures before, like mean and summary:
mean(mtcars$mpg)  # overall mean fuel economy (miles per gallon)
## [1] 20.09062
summary(mtcars$mpg)  # min, quartiles, median, mean, max
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.40 15.42 19.20 20.09 22.80 33.90
And we’ve made frequency tables of values:
table(mtcars$gear)  # count of cars at each number of forward gears
##
## 3 4 5
## 15 12 5
Although we can go beyond 1 variable to 2 or 3:
table(mtcars$gear, mtcars$am) # rows = gear, cols = am (0 = automatic, 1 = manual)
##
## 0 1
## 3 15 0
## 4 4 8
## 5 0 5
ftable makes output for three-dimensional tables a bit easier to read:
ftable(mtcars$gear, mtcars$am, mtcars$cyl) # rows = gear, subrows = am, columns = cyl
## 4 6 8
##
## 3 0 1 2 12
## 1 0 0 0
## 4 0 2 2 0
## 1 6 2 0
## 5 0 0 0 0
## 1 2 1 2
or proportion tables:
prop.table(table(mtcars$gear))  # counts converted to proportions (sum to 1)
##
## 3 4 5
## 0.46875 0.37500 0.15625
And we’ve learned we can group our data and summarize by group too
# Mean mpg and hp within each transmission group (am = 0 and am = 1)
mtcars %>%
group_by(am) %>%
summarize(mean(mpg),
mean(hp))
## # A tibble: 2 x 3
## am `mean(mpg)` `mean(hp)`
## <dbl> <dbl> <dbl>
## 1 0 17.14737 160.2632
## 2 1 24.39231 126.8462
What else can we do?
Correlation between two variables
cor(mtcars$mpg, mtcars$disp)  # Pearson correlation (the default method)
## [1] -0.8475514
Or all pairwise comparisons:
cor(mtcars)  # correlation matrix for every pair of columns (all are numeric here)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
Alternatively, covariance
cov(mtcars)  # pairwise covariances; scale depends on each pair's units, unlike cor()
## mpg cyl disp hp drat
## mpg 36.324103 -9.1723790 -633.09721 -320.732056 2.19506351
## cyl -9.172379 3.1895161 199.66028 101.931452 -0.66836694
## disp -633.097208 199.6602823 15360.79983 6721.158669 -47.06401915
## hp -320.732056 101.9314516 6721.15867 4700.866935 -16.45110887
## drat 2.195064 -0.6683669 -47.06402 -16.451109 0.28588135
## wt -5.116685 1.3673710 107.68420 44.192661 -0.37272073
## qsec 4.509149 -1.8868548 -96.05168 -86.770081 0.08714073
## vs 2.017137 -0.7298387 -44.37762 -24.987903 0.11864919
## am 1.803931 -0.4657258 -36.56401 -8.320565 0.19015121
## gear 2.135685 -0.6491935 -50.80262 -6.358871 0.27598790
## carb -5.363105 1.5201613 79.06875 83.036290 -0.07840726
## wt qsec vs am gear
## mpg -5.1166847 4.50914919 2.01713710 1.80393145 2.1356855
## cyl 1.3673710 -1.88685484 -0.72983871 -0.46572581 -0.6491935
## disp 107.6842040 -96.05168145 -44.37762097 -36.56401210 -50.8026210
## hp 44.1926613 -86.77008065 -24.98790323 -8.32056452 -6.3588710
## drat -0.3727207 0.08714073 0.11864919 0.19015121 0.2759879
## wt 0.9573790 -0.30548161 -0.27366129 -0.33810484 -0.4210806
## qsec -0.3054816 3.19316613 0.67056452 -0.20495968 -0.2804032
## vs -0.2736613 0.67056452 0.25403226 0.04233871 0.0766129
## am -0.3381048 -0.20495968 0.04233871 0.24899194 0.2923387
## gear -0.4210806 -0.28040323 0.07661290 0.29233871 0.5443548
## carb 0.6757903 -1.89411290 -0.46370968 0.04637097 0.3266129
## carb
## mpg -5.36310484
## cyl 1.52016129
## disp 79.06875000
## hp 83.03629032
## drat -0.07840726
## wt 0.67579032
## qsec -1.89411290
## vs -0.46370968
## am 0.04637097
## gear 0.32661290
## carb 2.60887097
Do manual and automatic transmission cars have different mean miles per gallon?
# Welch two-sample t-test of mpg grouped by transmission (unequal variances assumed)
t.test(mtcars$mpg~mtcars$am)
##
## Welch Two Sample t-test
##
## data: mtcars$mpg by mtcars$am
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.280194 -3.209684
## sample estimates:
## mean in group 0 mean in group 1
## 17.14737 24.39231
Think of ~ as “as a function of”. So above, MPG as a function of transmission, meaning grouped by or explained by.
We don’t have to use the formula syntax. We can send data for two different groups:
t.test(mtcars$mpg[mtcars$am==0], mtcars$mpg[mtcars$am==1])  # same test, two vectors instead of a formula
In addition to the printout, you can get the various parts of the results
t1 <- t.test(mtcars$mpg~mtcars$am)  # store the result so we can extract pieces
names(t1)  # the components available in the result object
## [1] "statistic" "parameter" "p.value" "conf.int" "estimate"
## [6] "null.value" "alternative" "method" "data.name"
t1$statistic  # just the t statistic (a named numeric value)
## t
## -3.767123
Let’s predict MPG using a linear model. First, a simple bivariate regression – just one explanatory variable.
Basic syntax is
lm(y ~ x1 + x2 + x3, data=df)  # general form (placeholder names, not meant to be run)
lm(mpg ~ hp, data=mtcars)  # simple bivariate regression: mpg explained by horsepower
##
## Call:
## lm(formula = mpg ~ hp, data = mtcars)
##
## Coefficients:
## (Intercept) hp
## 30.09886 -0.06823
Default output isn’t much. You get a lot more with summary:
r1 <- lm(mpg ~ hp, data=mtcars)  # fit and store the model
summary(r1)  # coefficients, standard errors, R-squared, F-test
##
## Call:
## lm(formula = mpg ~ hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7121 -2.1122 -0.8854 1.5819 8.2360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.09886 1.63392 18.421 < 2e-16 ***
## hp -0.06823 0.01012 -6.742 1.79e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.863 on 30 degrees of freedom
## Multiple R-squared: 0.6024, Adjusted R-squared: 0.5892
## F-statistic: 45.46 on 1 and 30 DF, p-value: 1.788e-07
Note that a constant (Intercept) term was added automatically.
As above, you can pull out the individual components as needed
names(r1)  # components stored in the fitted lm object
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
plot(mtcars$mpg ~ mtcars$hp)  # scatterplot of the raw data
abline(r1)  # overlay the fitted regression line
Label points with
# Interactive: click points in the open plot window to label them by car name,
# then press Esc (or the Finish button) to stop
identify(mtcars$hp, mtcars$mpg,
labels=rownames(mtcars))
And visualizing diagnostics:
plot(r1)  # regression diagnostics: residuals vs fitted, Q-Q, scale-location, leverage
How do we add more variables? What about transforming variables? Here is some of the formula syntax:
| Symbol | Example | Description |
|---|---|---|
| `~` | `y ~ x1` | Defines the formula (necessary to create a formula object) |
| `+` | `y ~ x1 + x2` | Include the variable |
| `-` | `y ~ -1 + x1` | Delete a term, usually the 1 for the intercept |
| `:` | `y ~ x1 + x1:x2` | Interaction term |
| `*` | `y ~ x1*x2` | Interaction between the variables and each individually; same as `y ~ x1 + x2 + x1:x2` |
| `^` | `y ~ (x1 + x2 + x3)^3` | Include variables and all interactions, up to 3-way interactions |
| `I()` | `y ~ I(x1^2)` | Wrapper for transforming variables without having to create a new variable |
| `poly()` | `y ~ poly(x1, 2)` | Creates polynomial terms up to the degree specified |
We can use I() to transform variables as part of the formula syntax. We can add character or factor variables directly, and R will automatically create dummy (indicator) variables for us.
# Quadratic term in hp via I(); am and cyl enter as plain numeric variables
r2 <- lm(mpg ~ hp + I(hp^2) + am + cyl, data=mtcars)
summary(r2)
##
## Call:
## lm(formula = mpg ~ hp + I(hp^2) + am + cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5346 -1.4905 -0.1394 1.4281 5.8274
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5193499 3.3413965 10.331 7.03e-11 ***
## hp -0.1241475 0.0500586 -2.480 0.0197 *
## I(hp^2) 0.0002111 0.0001163 1.815 0.0806 .
## am 3.3892736 1.2783733 2.651 0.0133 *
## cyl -0.5007040 0.7005650 -0.715 0.4809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.698 on 27 degrees of freedom
## Multiple R-squared: 0.8254, Adjusted R-squared: 0.7996
## F-statistic: 31.92 on 4 and 27 DF, p-value: 7.088e-10
We added cylinders as a numeric variable above, but maybe it should be categorical? Also, a different way to specify the horsepower terms with poly
# poly(..., raw=TRUE) gives the hp and hp^2 terms; factor(cyl) treats cylinders
# as categorical, so R builds dummy variables for cyl = 6 and cyl = 8
r3 <- lm(mpg ~ poly(hp, 2, raw=TRUE) + am + I(factor(cyl)), data=mtcars)
summary(r3)
##
## Call:
## lm(formula = mpg ~ poly(hp, 2, raw = TRUE) + am + I(factor(cyl)),
## data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9796 -1.6906 -0.2307 1.3885 5.3339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5082435 3.5362418 9.193 1.19e-09 ***
## poly(hp, 2, raw = TRUE)1 -0.1191294 0.0488505 -2.439 0.02188 *
## poly(hp, 2, raw = TRUE)2 0.0001835 0.0001146 1.602 0.12128
## am 3.6759120 1.2581308 2.922 0.00711 **
## I(factor(cyl))6 -2.6100522 1.7052349 -1.531 0.13794
## I(factor(cyl))8 -1.4860065 2.7485108 -0.541 0.59334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.627 on 26 degrees of freedom
## Multiple R-squared: 0.8406, Adjusted R-squared: 0.81
## F-statistic: 27.42 on 5 and 26 DF, p-value: 1.364e-09
Note that it picked one group to be the base comparison group, and created variables for the other two. If, instead of seeing offsets from a base group, we want indicator variables for all groups, we can drop the automatically added intercept term:
# Dropping the intercept with -1 yields a separate indicator for every cyl level
# instead of offsets from a base group
r4 <- lm(mpg ~ -1 + poly(hp, 2, raw=TRUE) + am + I(factor(cyl)), data=mtcars)
summary(r4)
##
## Call:
## lm(formula = mpg ~ -1 + poly(hp, 2, raw = TRUE) + am + I(factor(cyl)),
## data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9796 -1.6906 -0.2307 1.3885 5.3339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## poly(hp, 2, raw = TRUE)1 -0.1191294 0.0488505 -2.439 0.02188 *
## poly(hp, 2, raw = TRUE)2 0.0001835 0.0001146 1.602 0.12128
## am 3.6759120 1.2581308 2.922 0.00711 **
## I(factor(cyl))4 32.5082435 3.5362418 9.193 1.19e-09 ***
## I(factor(cyl))6 29.8981913 4.4839344 6.668 4.50e-07 ***
## I(factor(cyl))8 31.0222370 5.4240165 5.719 5.09e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.627 on 26 degrees of freedom
## Multiple R-squared: 0.9872, Adjusted R-squared: 0.9843
## F-statistic: 334.7 on 6 and 26 DF, p-value: < 2.2e-16
This is also useful if we need to constrain the fitted line to go through the origin.
We can plot fitted values for each variable:
termplot(r4, rug=TRUE)  # plot each term's partial fit; rug marks the observed values
If we want to predict new values from our fitted model, we can supply a data frame with column names matching the data we used to fit the model. Let’s see how changing hp affects the model. Hold transmission and cylinders at constant values.
# Predict mpg over a grid of horsepower values, holding transmission (am = 0)
# and cylinders (cyl = 4) fixed; column names must match the fitting data
newvals <- data.frame(hp = seq(50, 250, by = 10),
                      am = 0,
                      cyl = 4)
mpg_predict <- predict(r4, newvals)
mpg_predict
## 1 2 3 4 5 6 7 8
## 27.01060 26.02120 25.06849 24.15250 23.27321 22.43063 21.62475 20.85558
## 9 10 11 12 13 14 15 16
## 20.12312 19.42737 18.76832 18.14597 17.56034 17.01141 16.49918 16.02367
## 17 18 19 20 21
## 15.58486 15.18275 14.81736 14.48867 14.19668
Note that I don’t have to transform hp or cyl before calling predict.
# Draw the predicted mpg curve against hp, then overlay the observed data
plot(mpg_predict ~ newvals$hp,
type="l", lwd=3, col="red", # line style: solid, thick, red
main="am=0; cyl=4") # title records the values held constant
points(mtcars$mpg~mtcars$hp)