Linear regression
# Preview the first six rows of the built-in mtcars data set.
head(mtcars, n = 6L)
| | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Mazda RX4 | 21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
| Mazda RX4 Wag | 21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
| Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
| Hornet 4 Drive | 21.4 | 6 | 258 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
| Hornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
| Valiant | 18.1 | 6 | 225 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
# Simple linear regression of fuel economy (mpg) on car weight (wt).
# The `1` spells out the intercept term explicitly.
model_1 <- lm(formula = mpg ~ 1 + wt, data = mtcars)
print(model_1)
##
## Call:
## lm(formula = mpg ~ 1 + wt, data = mtcars)
##
## Coefficients:
## (Intercept) wt
## 37.285 -5.344
# Residual quantiles, coefficient table and fit statistics for model_1.
print(summary(model_1))
##
## Call:
## lm(formula = mpg ~ 1 + wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
# Multiple linear regression: mpg explained by weight (wt) and
# horsepower (hp), with an explicit intercept term.
model_2 <- lm(formula = mpg ~ 1 + wt + hp, data = mtcars)
print(model_2)
##
## Call:
## lm(formula = mpg ~ 1 + wt + hp, data = mtcars)
##
## Coefficients:
## (Intercept) wt hp
## 37.22727 -3.87783 -0.03177
# Residual quantiles, coefficient table and fit statistics for model_2.
print(summary(model_2))
##
## Call:
## lm(formula = mpg ~ 1 + wt + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.941 -1.600 -0.182 1.050 5.854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.22727 1.59879 23.285 < 2e-16 ***
## wt -3.87783 0.63273 -6.129 1.12e-06 ***
## hp -0.03177 0.00903 -3.519 0.00145 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.593 on 29 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8148
## F-statistic: 69.21 on 2 and 29 DF, p-value: 9.109e-12
# Predict mpg for every car in mtcars from model_2 (in-sample
# predictions), then show the first six.
preds <- predict(object = model_2, newdata = mtcars)
head(preds, n = 6L)
## Mazda RX4 Mazda RX4 Wag Datsun 710
## 23.57233 22.58348 25.27582
## Hornet 4 Drive Hornet Sportabout Valiant
## 21.26502 18.32727 20.47382
# Correlation between observed mpg and the model's predictions; its
# square agrees with the Multiple R-squared reported for model_2.
r <- cor(x = mtcars[["mpg"]], y = preds)
print(r^2)
## [1] 0.8267855
Feature construction
# Scatter plot of mpg against weight with a fitted straight line
# (formula y ~ 1 + x) drawn by geom_smooth().
# Built with ggplot() + geom_point() instead of qplot(), which is
# deprecated as of ggplot2 3.4.0 and emits a warning when called.
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(formula = y ~ 1 + x, method = "lm")
p
Model with quadratic term
# Scatter plot of mpg against weight with a fitted quadratic curve:
# I(x^2) adds the squared weight as an extra regressor in the smooth.
# Built with ggplot() + geom_point() instead of qplot(), which is
# deprecated as of ggplot2 3.4.0 and emits a warning when called.
p <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(formula = y ~ 1 + x + I(x^2), method = "lm")
p
##
## Call:
## lm(formula = mpg ~ 1 + wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
##
## Call:
## lm(formula = mpg ~ 1 + wt + I(wt^2), data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.483 -1.998 -0.773 1.462 6.238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9308 4.2113 11.856 1.21e-12 ***
## wt -13.3803 2.5140 -5.322 1.04e-05 ***
## I(wt^2) 1.1711 0.3594 3.258 0.00286 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.651 on 29 degrees of freedom
## Multiple R-squared: 0.8191, Adjusted R-squared: 0.8066
## F-statistic: 65.64 on 2 and 29 DF, p-value: 1.715e-11
##
## Call:
## lm(formula = mpg ~ 1 + wt + I(wt^2) + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9799 -1.2543 -0.7521 1.2494 5.4202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.837283 3.659017 13.074 1.92e-13 ***
## wt -10.822173 2.281031 -4.744 5.58e-05 ***
## I(wt^2) 0.981811 0.312848 3.138 0.00398 **
## hp -0.027283 0.008032 -3.397 0.00206 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.27 on 28 degrees of freedom
## Multiple R-squared: 0.8719, Adjusted R-squared: 0.8581
## F-statistic: 63.5 on 3 and 28 DF, p-value: 1.309e-12
##
## Call:
## lm(formula = mpg ~ 1 + wt + I(wt^2) + hp + I(hp^2), data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8849 -1.8165 -0.3922 1.3499 4.5807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.945e+01 3.521e+00 14.044 6.27e-14 ***
## wt -9.220e+00 2.270e+00 -4.062 0.000375 ***
## I(wt^2) 8.500e-01 3.005e-01 2.829 0.008700 **
## hp -9.428e-02 3.193e-02 -2.952 0.006456 **
## I(hp^2) 1.743e-04 8.073e-05 2.159 0.039879 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.135 on 27 degrees of freedom
## Multiple R-squared: 0.8907, Adjusted R-squared: 0.8745
## F-statistic: 55.02 on 4 and 27 DF, p-value: 1.363e-12
Heights and weights
# Preview the first six rows of the heights_weights data.
head(heights_weights, n = 6L)
| sex | weight | height | | |
|---|---|---|---|---|
| M | 77 | 182 | 77 | 180 |
| F | 58 | 161 | 51 | 159 |
| F | 53 | 161 | 54 | 158 |
| M | 68 | 177 | 70 | 175 |
| F | 59 | 157 | 59 | 155 |
| M | 76 | 170 | 76 | 165 |
# Scatter plot of weight against height with a fitted straight line.
# Built with ggplot() + geom_point() instead of qplot(), which is
# deprecated as of ggplot2 3.4.0 and emits a warning when called.
ggplot(data = heights_weights, mapping = aes(x = height, y = weight)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ 1 + x)
##
## Call:
## lm(formula = weight ~ 1 + height, data = heights_weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.9000 -5.2946 -0.5476 5.0698 21.9765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -126.57554 10.84059 -11.68 <2e-16 ***
## height 1.12349 0.06348 17.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.971 on 196 degrees of freedom
## Multiple R-squared: 0.6151, Adjusted R-squared: 0.6131
## F-statistic: 313.2 on 1 and 196 DF, p-value: < 2.2e-16
# Scatter plot of weight against height, with points coloured by sex.
# Built with ggplot() + geom_point() instead of qplot(), which is
# deprecated as of ggplot2 3.4.0 and emits a warning when called.
ggplot(
  data = heights_weights,
  mapping = aes(x = height, y = weight, colour = sex)
) +
  geom_point()
Model with multiple intercepts
##
## Call:
## lm(formula = weight ~ -1 + sex + height, data = heights_weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.6077 -4.5014 -0.4761 5.1626 21.9951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## sexF -74.71594 14.68699 -5.087 8.51e-07 ***
## sexM -66.82173 15.87346 -4.210 3.90e-05 ***
## height 0.79906 0.08907 8.971 2.44e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.536 on 195 degrees of freedom
## Multiple R-squared: 0.9873, Adjusted R-squared: 0.9871
## F-statistic: 5039 on 3 and 195 DF, p-value: < 2.2e-16
Problems with identifiability
##
## Call:
## lm(formula = weight ~ 1 + sex + height, data = heights_weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.6077 -4.5014 -0.4761 5.1626 21.9951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -74.71594 14.68699 -5.087 8.51e-07 ***
## sexM 7.89421 1.60142 4.930 1.76e-06 ***
## height 0.79906 0.08907 8.971 2.44e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.536 on 195 degrees of freedom
## Multiple R-squared: 0.6578, Adjusted R-squared: 0.6543
## F-statistic: 187.4 on 2 and 195 DF, p-value: < 2.2e-16
Model with multiple intercepts and slopes
##
## Call:
## lm(formula = weight ~ -1 + sex + sex:height, data = heights_weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.3041 -4.3998 -0.6509 4.7883 20.7780
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## sexF -45.7084 20.6798 -2.210 0.0283 *
## sexM -97.7187 22.1883 -4.404 1.75e-05 ***
## sexF:height 0.6229 0.1255 4.964 1.51e-06 ***
## sexM:height 0.9727 0.1246 7.807 3.56e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.48 on 194 degrees of freedom
## Multiple R-squared: 0.9875, Adjusted R-squared: 0.9873
## F-statistic: 3837 on 4 and 194 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = weight ~ 1 + sex * height, data = heights_weights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.3041 -4.3998 -0.6509 4.7883 20.7780
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -45.7084 20.6798 -2.210 0.0283 *
## sexM -52.0103 30.3311 -1.715 0.0880 .
## height 0.6229 0.1255 4.964 1.51e-06 ***
## sexM:height 0.3497 0.1768 1.978 0.0494 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.48 on 194 degrees of freedom
## Multiple R-squared: 0.6645, Adjusted R-squared: 0.6593
## F-statistic: 128.1 on 3 and 194 DF, p-value: < 2.2e-16