Setup

# Load the tidyverse (dplyr pipes, etc.) used in the examples below
library(tidyverse)

Data

We’ll use a built-in data set, mtcars

data(mtcars)  # load the built-in mtcars data set into the workspace
head(mtcars)  # preview the first six rows
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Explore Data

First Steps

We’ve covered some simple measures before, like mean and summary:

mean(mtcars$mpg)  # arithmetic mean of the mpg column
## [1] 20.09062
summary(mtcars$mpg)  # five-number summary plus the mean
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.40   15.42   19.20   20.09   22.80   33.90

And we’ve made frequency tables of values:

table(mtcars$gear)  # frequency count of cars by number of gears
## 
##  3  4  5 
## 15 12  5

Although we can go beyond 1 variable to 2 or 3:

table(mtcars$gear, mtcars$am) # rows = gear, cols = am (transmission)
##    
##      0  1
##   3 15  0
##   4  4  8
##   5  0  5

ftable makes the output for 3-dimensional tables a bit easier to read

ftable(mtcars$gear, mtcars$am, mtcars$cyl) # rows = gear, subrows = am, columns = cyl
##       4  6  8
##              
## 3 0   1  2 12
##   1   0  0  0
## 4 0   2  2  0
##   1   6  2  0
## 5 0   0  0  0
##   1   2  1  2

or proportion tables:

prop.table(table(mtcars$gear))  # convert the counts to proportions of the total
## 
##       3       4       5 
## 0.46875 0.37500 0.15625

And we’ve learned we can group our data and summarize by group too

# Group means of mpg and hp within each transmission group (am)
mtcars %>%
  group_by(am) %>%
  summarize(mean(mpg),
            mean(hp)) 
## # A tibble: 2 x 3
##      am `mean(mpg)` `mean(hp)`
##   <dbl>       <dbl>      <dbl>
## 1     0    17.14737   160.2632
## 2     1    24.39231   126.8462

What else can we do?

Correlation

Correlation between two variables

cor(mtcars$mpg, mtcars$disp)  # correlation between mpg and displacement
## [1] -0.8475514

Or all pairwise comparisons:

cor(mtcars)  # full pairwise correlation matrix (all columns here are numeric)
##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000

Alternatively, covariance

cov(mtcars)  # pairwise covariance matrix; scale-dependent, unlike correlation
##              mpg         cyl        disp          hp         drat
## mpg    36.324103  -9.1723790  -633.09721 -320.732056   2.19506351
## cyl    -9.172379   3.1895161   199.66028  101.931452  -0.66836694
## disp -633.097208 199.6602823 15360.79983 6721.158669 -47.06401915
## hp   -320.732056 101.9314516  6721.15867 4700.866935 -16.45110887
## drat    2.195064  -0.6683669   -47.06402  -16.451109   0.28588135
## wt     -5.116685   1.3673710   107.68420   44.192661  -0.37272073
## qsec    4.509149  -1.8868548   -96.05168  -86.770081   0.08714073
## vs      2.017137  -0.7298387   -44.37762  -24.987903   0.11864919
## am      1.803931  -0.4657258   -36.56401   -8.320565   0.19015121
## gear    2.135685  -0.6491935   -50.80262   -6.358871   0.27598790
## carb   -5.363105   1.5201613    79.06875   83.036290  -0.07840726
##               wt         qsec           vs           am        gear
## mpg   -5.1166847   4.50914919   2.01713710   1.80393145   2.1356855
## cyl    1.3673710  -1.88685484  -0.72983871  -0.46572581  -0.6491935
## disp 107.6842040 -96.05168145 -44.37762097 -36.56401210 -50.8026210
## hp    44.1926613 -86.77008065 -24.98790323  -8.32056452  -6.3588710
## drat  -0.3727207   0.08714073   0.11864919   0.19015121   0.2759879
## wt     0.9573790  -0.30548161  -0.27366129  -0.33810484  -0.4210806
## qsec  -0.3054816   3.19316613   0.67056452  -0.20495968  -0.2804032
## vs    -0.2736613   0.67056452   0.25403226   0.04233871   0.0766129
## am    -0.3381048  -0.20495968   0.04233871   0.24899194   0.2923387
## gear  -0.4210806  -0.28040323   0.07661290   0.29233871   0.5443548
## carb   0.6757903  -1.89411290  -0.46370968   0.04637097   0.3266129
##             carb
## mpg  -5.36310484
## cyl   1.52016129
## disp 79.06875000
## hp   83.03629032
## drat -0.07840726
## wt    0.67579032
## qsec -1.89411290
## vs   -0.46370968
## am    0.04637097
## gear  0.32661290
## carb  2.60887097

Testing Differences

T-test

Do manual and automatic transmission cars have different mean miles per gallon?

# Two-sample t-test of mpg grouped by transmission (formula syntax)
t.test(mtcars$mpg~mtcars$am)
## 
##  Welch Two Sample t-test
## 
## data:  mtcars$mpg by mtcars$am
## t = -3.7671, df = 18.332, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -11.280194  -3.209684
## sample estimates:
## mean in group 0 mean in group 1 
##        17.14737        24.39231

Think of ~ as “as a function of”. So above, MPG as a function of transmission, meaning grouped by or explained by.

We don’t have to use the formula syntax. We can send data for two different groups:

# Equivalent test: pass each group's mpg values as separate vectors
t.test(mtcars$mpg[mtcars$am==0], mtcars$mpg[mtcars$am==1])

In addition to the printout, you can get the various parts of the results

t1 <- t.test(mtcars$mpg~mtcars$am)  # store the full test result object
names(t1)                           # list the components available for extraction
## [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
## [6] "null.value"  "alternative" "method"      "data.name"
t1$statistic  # extract just the t statistic from the result
##         t 
## -3.767123

Regression

Let’s predict MPG using a linear model. First, a simple bivariate regression – just one explanatory variable.

Basic syntax is

lm(y ~ x1 + x2 + x3, data=df)  # general form (illustrative; y, x1, ... are placeholders)
lm(mpg ~ hp, data=mtcars)      # bivariate regression: mpg explained by horsepower
## 
## Call:
## lm(formula = mpg ~ hp, data = mtcars)
## 
## Coefficients:
## (Intercept)           hp  
##    30.09886     -0.06823

Default output isn’t much. You get a lot more with summary:

r1 <- lm(mpg ~ hp, data=mtcars)  # save the fitted model object
summary(r1)                      # coefficients, std. errors, R-squared, F-test
## 
## Call:
## lm(formula = mpg ~ hp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7121 -2.1122 -0.8854  1.5819  8.2360 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.09886    1.63392  18.421  < 2e-16 ***
## hp          -0.06823    0.01012  -6.742 1.79e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.863 on 30 degrees of freedom
## Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
## F-statistic: 45.46 on 1 and 30 DF,  p-value: 1.788e-07

Note that a constant (Intercept) term was added automatically.

As above, you can pull out the individual components as needed

names(r1)  # components stored in the fitted lm object
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

Visualizing the result

plot(mtcars$mpg ~ mtcars$hp)  # scatterplot of mpg against hp
abline(r1)                    # overlay the fitted regression line

Label points with

# Interactive: click points on the open plot to label them by car name
identify(mtcars$hp, mtcars$mpg, 
         labels=rownames(mtcars))

And visualizing diagnostics:

plot(r1)  # regression diagnostic plots (residuals vs fitted, Q-Q, etc.)

Additional terms

How do we add more variables? What about transforming variables? Here is some of the formula syntax:

Symbol Example Description
~ y ~ x1 Defines the formula (necessary to create a formula object)
+ y ~ x1 + x2 Include the variable
- y ~ -1 + x1 Delete a term; most commonly -1 to remove the intercept
: y ~ x1 + x1:x2 Interaction term
* y ~ x1*x2 Interaction between the variables and each individually; same as y ~ x1 + x2 + x1:x2
^ y ~ (x1 + x2 + x3)^3 Include the variables and all their interactions, up to 3-way interactions
I() y ~ I(x1^2) Wrapper for transforming variables without having to create a new variable
poly() y ~ poly(x1, 2) Creates polynomial terms up to the degree specified

We can use I() to transform variables as part of the formula syntax. We can add character or factor variables directly, and R will automatically create dummy (indicator) variables for us.

# Quadratic hp term via I(), plus transmission and (numeric) cylinder count
r2 <- lm(mpg ~ hp + I(hp^2) + am + cyl, data=mtcars)
summary(r2)
## 
## Call:
## lm(formula = mpg ~ hp + I(hp^2) + am + cyl, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5346 -1.4905 -0.1394  1.4281  5.8274 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.5193499  3.3413965  10.331 7.03e-11 ***
## hp          -0.1241475  0.0500586  -2.480   0.0197 *  
## I(hp^2)      0.0002111  0.0001163   1.815   0.0806 .  
## am           3.3892736  1.2783733   2.651   0.0133 *  
## cyl         -0.5007040  0.7005650  -0.715   0.4809    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.698 on 27 degrees of freedom
## Multiple R-squared:  0.8254, Adjusted R-squared:  0.7996 
## F-statistic: 31.92 on 4 and 27 DF,  p-value: 7.088e-10

We added cylinders as a numeric variable above, but maybe it should be categorical? Also, here is a different way to specify the horsepower terms, using poly

# poly(..., raw=TRUE) builds the hp and hp^2 terms; factor(cyl) treats cyl as categorical
r3 <- lm(mpg ~ poly(hp, 2, raw=TRUE) + am + I(factor(cyl)), data=mtcars)
summary(r3)
## 
## Call:
## lm(formula = mpg ~ poly(hp, 2, raw = TRUE) + am + I(factor(cyl)), 
##     data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9796 -1.6906 -0.2307  1.3885  5.3339 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              32.5082435  3.5362418   9.193 1.19e-09 ***
## poly(hp, 2, raw = TRUE)1 -0.1191294  0.0488505  -2.439  0.02188 *  
## poly(hp, 2, raw = TRUE)2  0.0001835  0.0001146   1.602  0.12128    
## am                        3.6759120  1.2581308   2.922  0.00711 ** 
## I(factor(cyl))6          -2.6100522  1.7052349  -1.531  0.13794    
## I(factor(cyl))8          -1.4860065  2.7485108  -0.541  0.59334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.627 on 26 degrees of freedom
## Multiple R-squared:  0.8406, Adjusted R-squared:   0.81 
## F-statistic: 27.42 on 5 and 26 DF,  p-value: 1.364e-09

Note that it picked one group to be the base comparison group, and created variables for the other two. If, instead of seeing offsets from a base group, we want indicator variables for all groups, we can drop the automatically added intercept term:

# -1 drops the intercept so every cyl level gets its own indicator coefficient
r4 <- lm(mpg ~ -1 + poly(hp, 2, raw=TRUE) + am + I(factor(cyl)), data=mtcars)
summary(r4)
## 
## Call:
## lm(formula = mpg ~ -1 + poly(hp, 2, raw = TRUE) + am + I(factor(cyl)), 
##     data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9796 -1.6906 -0.2307  1.3885  5.3339 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## poly(hp, 2, raw = TRUE)1 -0.1191294  0.0488505  -2.439  0.02188 *  
## poly(hp, 2, raw = TRUE)2  0.0001835  0.0001146   1.602  0.12128    
## am                        3.6759120  1.2581308   2.922  0.00711 ** 
## I(factor(cyl))4          32.5082435  3.5362418   9.193 1.19e-09 ***
## I(factor(cyl))6          29.8981913  4.4839344   6.668 4.50e-07 ***
## I(factor(cyl))8          31.0222370  5.4240165   5.719 5.09e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.627 on 26 degrees of freedom
## Multiple R-squared:  0.9872, Adjusted R-squared:  0.9843 
## F-statistic: 334.7 on 6 and 26 DF,  p-value: < 2.2e-16

This is also useful if we need to constrain the fitted line to go through the origin.

We can plot fitted values for each variable:

termplot(r4, rug=TRUE)  # plot of each term's fitted contribution; rug marks observed values

More on termplot

Predicting values

If we want to predict new values from our fitted model, we can supply a data frame with column names matching the data we used to fit the model. Let’s see how changing hp affects the model. Hold transmission and cylinders at constant values.

# Grid of hp values for prediction, holding am and cyl at fixed values
newvals <- data.frame(hp=seq(50,250,10),
                      am=0,
                      cyl=4)
mpg_predict<- predict(r4, newvals)  # predicted mpg at each new hp value
mpg_predict
##        1        2        3        4        5        6        7        8 
## 27.01060 26.02120 25.06849 24.15250 23.27321 22.43063 21.62475 20.85558 
##        9       10       11       12       13       14       15       16 
## 20.12312 19.42737 18.76832 18.14597 17.56034 17.01141 16.49918 16.02367 
##       17       18       19       20       21 
## 15.58486 15.18275 14.81736 14.48867 14.19668

Note that I don’t have to transform hp or cyl before calling predict.

# Predicted mpg curve over hp, with the observed data overlaid
plot(mpg_predict ~ newvals$hp, 
     type="l", lwd=3, col="red", # line style
     main="am=0; cyl=4") # making note of other values
points(mtcars$mpg~mtcars$hp)  # overlay the observed (hp, mpg) points