Heights of fathers and sons

# Load the father/son height table from a tab-separated file. Both columns
# are declared as doubles explicitly, so their types do not depend on
# readr's type guessing and no column-specification message is printed.
father_son <- read_tsv(
  "father_son.tsv",
  col_types = cols(Father = col_double(), Son = col_double())
)
# Preview the first rows.
head(father_son)
## # A tibble: 6 × 2
##   Father   Son
##    <dbl> <dbl>
## 1   65.0  59.8
## 2   63.3  63.2
## 3   65.0  63.3
## 4   65.8  62.8
## 5   61.1  64.3
## 6   63.0  64.2

# Scatter plot of Son against Father with a fitted straight line in red.
# Built with ggplot() because qplot() is deprecated (ggplot2 >= 3.4.0).
# The point size is set on the point layer only, so it is not inherited by
# the smoothing layer.
p <- ggplot(father_son, aes(x = Father, y = Son)) +
  geom_point(size = 0.25) +
  geom_smooth(method = "lm", se = FALSE, color = "red")
p


Simple linear regression

# Fit a simple linear regression of Son on Father (intercept written out
# explicitly as `1 +`), then display the fitted coefficients.
model <- lm(formula = Son ~ 1 + Father, data = father_son)
print(model)
## 
## Call:
## lm(formula = Son ~ 1 + Father, data = father_son)
## 
## Coefficients:
## (Intercept)       Father  
##      33.893        0.514

# Three new Father values to predict Son for, stored as a tibble.
newdata <- data.frame(Father = c(72, 64, 70)) %>%
  as_tibble()
# Predicted Son values from the fitted model, one per row of `newdata`.
predict(model, newdata = newdata)
##        1        2        3 
## 70.90123 66.78918 69.87321

## # A tibble: 1 × 2
##   Father   Son
##    <dbl> <dbl>
## 1   71.8  72.6


## # A tibble: 1 × 3
##   Father   Son     pred
##    <dbl> <dbl>    <dbl>
## 1   71.8  72.6 70.79843


# Attach the model's prediction for every row, then the residual
# (observed Son minus predicted Son), and preview the result.
father_son <- father_son %>%
  mutate(pred = predict(model, newdata = .)) %>%
  mutate(residual = Son - pred)
head(father_son)
## # A tibble: 6 × 4
##   Father   Son     pred   residual
##    <dbl> <dbl>    <dbl>      <dbl>
## 1   65.0  59.8 67.30318 -7.5031849
## 2   63.3  63.2 66.42937 -3.2293748
## 3   65.0  63.3 67.30318 -4.0031849
## 4   65.8  62.8 67.71439 -4.9143896
## 5   61.1  64.3 65.29856 -0.9985618
## 6   63.0  64.2 66.27517 -2.0751730

# Show the fitted model again so its slope can be compared with the value
# derived by hand below.
print(model)
## 
## Call:
## lm(formula = Son ~ 1 + Father, data = father_son)
## 
## Coefficients:
## (Intercept)       Father  
##      33.893        0.514
# Slope from summary statistics: the correlation between Father and Son
# times the standard deviation of Son, divided by that of Father.
r <- with(father_son, cor(Father, Son))
sd_x <- sd(father_son[["Father"]])
sd_y <- sd(father_son[["Son"]])

beta_1 <- r * sd_y / sd_x
print(beta_1)
## [1] 0.5140059

# Show the fitted model again so its intercept can be compared with the
# value derived by hand below.
print(model)
## 
## Call:
## lm(formula = Son ~ 1 + Father, data = father_son)
## 
## Coefficients:
## (Intercept)       Father  
##      33.893        0.514
# Intercept from summary statistics: mean of Son minus the slope times the
# mean of Father. Uses `beta_1` computed earlier in the script.
mu_x <- mean(father_son[["Father"]])
mu_y <- mean(father_son[["Son"]])

beta_0 <- mu_y - beta_1 * mu_x
print(beta_0)
## [1] 33.8928

# Histogram of the regression residuals. Uses ggplot() in place of the
# deprecated qplot(); `bins = 30` spells out the default bin count, which
# also avoids the "Pick better value" message from stat_bin().
ggplot(father_son, aes(x = residual)) +
  geom_histogram(bins = 30)


# Full regression summary: residual quantiles, coefficient table, and fit
# statistics for the model.
print(summary(model))

## Call:
## lm(formula = Son ~ 1 + Father, data = father_son)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8910 -1.5361 -0.0092  1.6359  8.9894 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.89280    1.83289   18.49   <2e-16 ***
## Father       0.51401    0.02706   19.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.438 on 1076 degrees of freedom
## Multiple R-squared:  0.2512,    Adjusted R-squared:  0.2505 
## F-statistic: 360.9 on 1 and 1076 DF,  p-value: < 2.2e-16