Heights of fathers and sons

father_son <- read_tsv("father_son.tsv")
head(father_son)
## # A tibble: 6 × 2
##   Father   Son
##    <dbl> <dbl>
## 1   65.0  59.8
## 2   63.3  63.2
## 3   65.0  63.3
## 4   65.8  62.8
## 5   61.1  64.3
## 6   63.0  64.2

qplot(data = father_son, 
  x = Father, y = Son, geom = "point", size=I(0.25))

The standard deviation line

sd_line <- function(x, y) {
  slope <- sd(y)/sd(x)
  intercept <- mean(y) - mean(x) * slope
  geom_abline(slope = slope, intercept = intercept, 
      linetype = "dashed", color = "blue", size=0.8)
}

qplot(data = father_son, 
  x = Father, y = Son, geom = "point", size=I(0.25)) +
  sd_line(father_son$Father, father_son$Son)


measures <- read_tsv("heights_weights.tsv")
head(measures)
## # A tibble: 6 × 2
##   weight height
##    <int>  <int>
## 1    169     72
## 2    150     70
## 3    167     67
## 4    167     66
## 5    152     73
## 6    156     70

p <- qplot(data = measures, x = height, y = weight, 
        geom = "point", size=I(0.75)) +
  sd_line(measures$height, measures$weight)
p


p +
  geom_point(aes(x = mean(height) + sd(height), 
                 y = mean(weight) + sd(weight)),
             color = "blue", size = 2) +
  geom_point(aes(x = mean(height) - sd(height), 
                 y = mean(weight) - sd(weight)),
             color = "blue", size = 2)

Correlation

cor(x, y)
## [1] 0.3997

cor(x, y)
## [1] 0.8995

cor(x, y)
## [1] 0.01381

Correlation measures linear association

cor(x, y)
## [1] -0.00536

Regression

head(measures)
## # A tibble: 6 × 2
##   weight height
##    <int>  <int>
## 1    169     72
## 2    150     70
## 3    167     67
## 4    167     66
## 5    152     73
## 6    156     70

p <- qplot(data = measures, x = height, y = weight, 
           geom = "point", size=I(0.75)) +
  sd_line(measures$height, measures$weight)
p


q <- p +
  geom_point(aes(x = mean(height) + sd(height), 
                 y = mean(weight) + sd(weight)),
             color = "blue", size = 2) +
  geom_point(aes(x = mean(height) - sd(height), 
                 y = mean(weight) - sd(weight)),
             color = "blue", size = 2)
q

Regression line

tall <- measures %>%
  filter(height >= 72 & height <= 73)
avg_tall <- mean(tall$weight)

short <- measures %>%
  filter(height >= 67 & height <= 68)
avg_short <- mean(short$weight)

q <- q +
  geom_point(aes(x = mean(height) + sd(height)), 
             y = avg_tall, color = "red", size = 2) +
  geom_point(aes(x = mean(height) - sd(height)),  
             y = avg_short, color = "red", size = 2)
q


q <- q + geom_smooth(method = "lm", se = FALSE, 
                     color = "red")
q

Regression to the mean

avg_tall
## [1] 181.6
avg_tall - mean(measures$weight)
## [1] 14.64
sd(measures$weight)
## [1] 26.17

(avg_tall - mean(measures$weight)) / sd(measures$weight)
## [1] 0.5596
(avg_short - mean(measures$weight)) / sd(measures$weight)
## [1] -0.5076
cor(measures$height, measures$weight)
## [1] 0.5309

Regression line

measures <- measures %>%
  mutate(rounded_height = round_any(height, 2))
head(measures)
## # A tibble: 6 × 3
##   weight height rounded_height
##    <int>  <int>          <dbl>
## 1    169     72             72
## 2    150     70             70
## 3    167     67             68
## 4    167     66             66
## 5    152     73             72
## 6    156     70             70

avgs <- measures %>%
  group_by(rounded_height) %>%
  summarize(avg_weight = mean(weight))
head(avgs)
## # A tibble: 6 × 2
##   rounded_height avg_weight
##            <dbl>      <dbl>
## 1             64      136.5
## 2             66      144.0
## 3             68      154.9
## 4             70      162.5
## 5             72      182.4
## 6             74      190.0

q <- p + 
  geom_smooth(method = "lm", se = FALSE, 
                     color = "red") +
  geom_point(
    data = avgs,  
    aes(rounded_height, avg_weight),  
    color = "red", size = 2)
q



p +
  geom_smooth(method = "lm", se = FALSE, 
              color = "red")

cor(x, y)
## [1] -0.05958

ability <- rnorm(1000, 0, 1)
test_1 <- ability + rnorm(1000, 0, .5)
test_2 <- ability + rnorm(1000, 0, .5)

p <- qplot(test_1, test_2, size=I(0.25)) +
  sd_line(test_1, test_2)
p


p <- p +
  geom_smooth(method = "lm", se = FALSE, 
              color = "red")
p