Linear regression

head(mtcars)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# Simple linear regression of mpg on wt. The explicit `1 +` spells out the
# intercept term, which lm() would include by default anyway.
model_1 <- lm(mpg ~ 1 + wt, data = mtcars)
# Auto-print the fit: shows the call and the two estimated coefficients.
model_1
## 
## Call:
## lm(formula = mpg ~ 1 + wt, data = mtcars)
## 
## Coefficients:
## (Intercept)           wt  
##      37.285       -5.344

# Full fit report: residual quantiles, coefficient table (estimate, standard
# error, t value, p-value), residual standard error, R-squared and F-statistic.
summary(model_1)
## 
## Call:
## lm(formula = mpg ~ 1 + wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10

# Extend model_1 with a second predictor: mpg is now regressed on wt and hp,
# again with the intercept written out explicitly as `1 +`.
model_2 <- lm(mpg ~ 1 + wt + hp, data = mtcars)
# Auto-print the fit: shows the call and the three estimated coefficients.
model_2
## 
## Call:
## lm(formula = mpg ~ 1 + wt + hp, data = mtcars)
## 
## Coefficients:
## (Intercept)           wt           hp  
##    37.22727     -3.87783     -0.03177

# Full fit report for the two-predictor model, in the same layout as the
# summary of model_1 so the two fits can be compared line by line.
summary(model_2)
## 
## Call:
## lm(formula = mpg ~ 1 + wt + hp, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.941 -1.600 -0.182  1.050  5.854 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.22727    1.59879  23.285  < 2e-16 ***
## wt          -3.87783    0.63273  -6.129 1.12e-06 ***
## hp          -0.03177    0.00903  -3.519  0.00145 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.593 on 29 degrees of freedom
## Multiple R-squared:  0.8268, Adjusted R-squared:  0.8148 
## F-statistic: 69.21 on 2 and 29 DF,  p-value: 9.109e-12

# Predict mpg from model_2 for the same data frame it was fitted on
# (mtcars is passed as the second argument of predict()).
preds <- predict(model_2, mtcars)
# Peek at the first few predictions; the vector is named by car.
head(preds)
##         Mazda RX4     Mazda RX4 Wag        Datsun 710 
##          23.57233          22.58348          25.27582 
##    Hornet 4 Drive Hornet Sportabout           Valiant 
##          21.26502          18.32727          20.47382
# Correlation between the observed mpg values and the model_2 predictions.
r <- cor(mtcars$mpg, preds)
# Squaring that correlation gives the same value that summary(model_2)
# reports as "Multiple R-squared" (0.8268 after rounding).
r^2
## [1] 0.8267855

Feature construction

# Scatter plot of mpg against wt with a straight-line least-squares fit
# (formula y ~ 1 + x) overlaid.
# Built with ggplot() + geom_point() instead of qplot(): qplot() is deprecated
# as of ggplot2 3.4.0 and warns on every call. The layers, aesthetics and axis
# labels are the ones qplot() would have produced.
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(formula = y ~ 1 + x,
              method = "lm")
# Auto-print the plot object to render it.
p

Model with quadratic term

# Same scatter plot of mpg against wt, but the smoother now fits a quadratic:
# I(x^2) adds the squared predictor as an extra feature, so the model is still
# linear in its coefficients.
# Built with ggplot() + geom_point() instead of qplot(): qplot() is deprecated
# as of ggplot2 3.4.0 and warns on every call. The layers, aesthetics and axis
# labels are the ones qplot() would have produced.
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(formula = y ~ 1 + x + I(x^2),
              method = "lm")
# Auto-print the plot object to render it.
p


## 
## Call:
## lm(formula = mpg ~ 1 + wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5432 -2.3647 -0.1252  1.4096  6.8727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2851     1.8776  19.858  < 2e-16 ***
## wt           -5.3445     0.5591  -9.559 1.29e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7446 
## F-statistic: 91.38 on 1 and 30 DF,  p-value: 1.294e-10

## 
## Call:
## lm(formula = mpg ~ 1 + wt + I(wt^2), data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.483 -1.998 -0.773  1.462  6.238 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  49.9308     4.2113  11.856 1.21e-12 ***
## wt          -13.3803     2.5140  -5.322 1.04e-05 ***
## I(wt^2)       1.1711     0.3594   3.258  0.00286 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.651 on 29 degrees of freedom
## Multiple R-squared:  0.8191, Adjusted R-squared:  0.8066 
## F-statistic: 65.64 on 2 and 29 DF,  p-value: 1.715e-11

## 
## Call:
## lm(formula = mpg ~ 1 + wt + I(wt^2) + hp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9799 -1.2543 -0.7521  1.2494  5.4202 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  47.837283   3.659017  13.074 1.92e-13 ***
## wt          -10.822173   2.281031  -4.744 5.58e-05 ***
## I(wt^2)       0.981811   0.312848   3.138  0.00398 ** 
## hp           -0.027283   0.008032  -3.397  0.00206 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.27 on 28 degrees of freedom
## Multiple R-squared:  0.8719, Adjusted R-squared:  0.8581 
## F-statistic:  63.5 on 3 and 28 DF,  p-value: 1.309e-12

## 
## Call:
## lm(formula = mpg ~ 1 + wt + I(wt^2) + hp + I(hp^2), data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8849 -1.8165 -0.3922  1.3499  4.5807 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.945e+01  3.521e+00  14.044 6.27e-14 ***
## wt          -9.220e+00  2.270e+00  -4.062 0.000375 ***
## I(wt^2)      8.500e-01  3.005e-01   2.829 0.008700 ** 
## hp          -9.428e-02  3.193e-02  -2.952 0.006456 ** 
## I(hp^2)      1.743e-04  8.073e-05   2.159 0.039879 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.135 on 27 degrees of freedom
## Multiple R-squared:  0.8907, Adjusted R-squared:  0.8745 
## F-statistic: 55.02 on 4 and 27 DF,  p-value: 1.363e-12

Heights and weights

head(heights_weights)
sex weight height repwt repht
M       77    182    77   180
F       58    161    51   159
F       53    161    54   158
M       68    177    70   175
F       59    157    59   155
M       76    170    76   165

# Scatter plot of weight against height with a single straight-line
# least-squares fit (formula y ~ 1 + x) through all observations.
# Built with ggplot() + geom_point() instead of qplot(): qplot() is deprecated
# as of ggplot2 3.4.0 and warns on every call. The rendered plot is the same.
ggplot(data = heights_weights, mapping = aes(x = height, y = weight)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ 1 + x)


## 
## Call:
## lm(formula = weight ~ 1 + height, data = heights_weights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.9000  -5.2946  -0.5476   5.0698  21.9765 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -126.57554   10.84059  -11.68   <2e-16 ***
## height         1.12349    0.06348   17.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.971 on 196 degrees of freedom
## Multiple R-squared:  0.6151, Adjusted R-squared:  0.6131 
## F-statistic: 313.2 on 1 and 196 DF,  p-value: < 2.2e-16

# Same height/weight scatter plot, with points coloured by the sex column so
# the two groups can be told apart.
# Built with ggplot() + geom_point() instead of qplot(): qplot() is deprecated
# as of ggplot2 3.4.0 and warns on every call. `color = sex` stays an aesthetic
# mapping, exactly as qplot() treated it.
ggplot(data = heights_weights,
       mapping = aes(x = height, y = weight, color = sex)) +
  geom_point()

Model with multiple intercepts

## 
## Call:
## lm(formula = weight ~ -1 + sex + height, data = heights_weights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6077  -4.5014  -0.4761   5.1626  21.9951 
## 
## Coefficients:
##         Estimate Std. Error t value Pr(>|t|)    
## sexF   -74.71594   14.68699  -5.087 8.51e-07 ***
## sexM   -66.82173   15.87346  -4.210 3.90e-05 ***
## height   0.79906    0.08907   8.971 2.44e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.536 on 195 degrees of freedom
## Multiple R-squared:  0.9873, Adjusted R-squared:  0.9871 
## F-statistic:  5039 on 3 and 195 DF,  p-value: < 2.2e-16

Problems with identifiability

## 
## Call:
## lm(formula = weight ~ 1 + sex + height, data = heights_weights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6077  -4.5014  -0.4761   5.1626  21.9951 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -74.71594   14.68699  -5.087 8.51e-07 ***
## sexM          7.89421    1.60142   4.930 1.76e-06 ***
## height        0.79906    0.08907   8.971 2.44e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.536 on 195 degrees of freedom
## Multiple R-squared:  0.6578, Adjusted R-squared:  0.6543 
## F-statistic: 187.4 on 2 and 195 DF,  p-value: < 2.2e-16

Model with multiple intercepts and slopes

## 
## Call:
## lm(formula = weight ~ -1 + sex + sex:height, data = heights_weights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.3041  -4.3998  -0.6509   4.7883  20.7780 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## sexF        -45.7084    20.6798  -2.210   0.0283 *  
## sexM        -97.7187    22.1883  -4.404 1.75e-05 ***
## sexF:height   0.6229     0.1255   4.964 1.51e-06 ***
## sexM:height   0.9727     0.1246   7.807 3.56e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.48 on 194 degrees of freedom
## Multiple R-squared:  0.9875, Adjusted R-squared:  0.9873 
## F-statistic:  3837 on 4 and 194 DF,  p-value: < 2.2e-16

## 
## Call:
## lm(formula = weight ~ 1 + sex * height, data = heights_weights)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.3041  -4.3998  -0.6509   4.7883  20.7780 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -45.7084    20.6798  -2.210   0.0283 *  
## sexM        -52.0103    30.3311  -1.715   0.0880 .  
## height        0.6229     0.1255   4.964 1.51e-06 ***
## sexM:height   0.3497     0.1768   1.978   0.0494 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.48 on 194 degrees of freedom
## Multiple R-squared:  0.6645, Adjusted R-squared:  0.6593 
## F-statistic: 128.1 on 3 and 194 DF,  p-value: < 2.2e-16