Heights of fathers and sons
father_son <- read_tsv("father_son.tsv")
head(father_son)
## # A tibble: 6 × 2
## Father Son
## <dbl> <dbl>
## 1 65.0 59.8
## 2 63.3 63.2
## 3 65.0 63.3
## 4 65.8 62.8
## 5 61.1 64.3
## 6 63.0 64.2
qplot(data = father_son,
x = Father, y = Son, geom = "point", size=I(0.25))

The standard deviation line
sd_line <- function(x, y) {
slope <- sd(y)/sd(x)
intercept <- mean(y) - mean(x) * slope
geom_abline(slope = slope, intercept = intercept,
linetype = "dashed", color = "blue", size=0.8)
}
qplot(data = father_son,
x = Father, y = Son, geom = "point", size=I(0.25)) +
sd_line(father_son$Father, father_son$Son)

measures <- read_tsv("heights_weights.tsv")
head(measures)
## # A tibble: 6 × 2
## weight height
## <int> <int>
## 1 169 72
## 2 150 70
## 3 167 67
## 4 167 66
## 5 152 73
## 6 156 70
p <- qplot(data = measures, x = height, y = weight,
geom = "point", size=I(0.75)) +
sd_line(measures$height, measures$weight)
p

p +
geom_point(aes(x = mean(height) + sd(height),
y = mean(weight) + sd(weight)),
color = "blue", size = 2) +
geom_point(aes(x = mean(height) - sd(height),
y = mean(weight) - sd(weight)),
color = "blue", size = 2)

Correlation

cor(x, y)
## [1] 0.3997

cor(x, y)
## [1] 0.8995

cor(x, y)
## [1] 0.01381
Correlation measures linear association

cor(x, y)
## [1] -0.00536
Regression
head(measures)
## # A tibble: 6 × 2
## weight height
## <int> <int>
## 1 169 72
## 2 150 70
## 3 167 67
## 4 167 66
## 5 152 73
## 6 156 70
p <- qplot(data = measures, x = height, y = weight,
geom = "point", size=I(0.75)) +
sd_line(measures$height, measures$weight)
p

q <- p +
geom_point(aes(x = mean(height) + sd(height),
y = mean(weight) + sd(weight)),
color = "blue", size = 2) +
geom_point(aes(x = mean(height) - sd(height),
y = mean(weight) - sd(weight)),
color = "blue", size = 2)
q

Regression line
tall <- measures %>%
filter(height >= 72 & height <= 73)
avg_tall <- mean(tall$weight)
short <- measures %>%
filter(height >= 67 & height <= 68)
avg_short <- mean(short$weight)
q <- q +
geom_point(aes(x = mean(height) + sd(height)),
y = avg_tall, color = "red", size = 2) +
geom_point(aes(x = mean(height) - sd(height)),
y = avg_short, color = "red", size = 2)
q

q <- q + geom_smooth(method = "lm", se = FALSE,
color = "red")
q

Regression to the mean
avg_tall
## [1] 181.6
avg_tall - mean(measures$weight)
## [1] 14.64
sd(measures$weight)
## [1] 26.17
(avg_tall - mean(measures$weight)) / sd(measures$weight)
## [1] 0.5596
(avg_short - mean(measures$weight)) / sd(measures$weight)
## [1] -0.5076
cor(measures$height, measures$weight)
## [1] 0.5309
Regression line
measures <- measures %>%
mutate(rounded_height = round_any(height, 2))
head(measures)
## # A tibble: 6 × 3
## weight height rounded_height
## <int> <int> <dbl>
## 1 169 72 72
## 2 150 70 70
## 3 167 67 68
## 4 167 66 66
## 5 152 73 72
## 6 156 70 70
avgs <- measures %>%
group_by(rounded_height) %>%
summarize(avg_weight = mean(weight))
head(avgs)
## # A tibble: 6 × 2
## rounded_height avg_weight
## <dbl> <dbl>
## 1 64 136.5
## 2 66 144.0
## 3 68 154.9
## 4 70 162.5
## 5 72 182.4
## 6 74 190.0
q <- p +
geom_smooth(method = "lm", se = FALSE,
color = "red") +
geom_point(
data = avgs,
aes(rounded_height, avg_weight),
color = "red", size = 2)
q


p +
geom_smooth(method = "lm", se = FALSE,
color = "red")

cor(x, y)
## [1] -0.05958
ability <- rnorm(1000, 0, 1)
test_1 <- ability + rnorm(1000, 0, .5)
test_2 <- ability + rnorm(1000, 0, .5)
p <- qplot(test_1, test_2, size=I(0.25)) +
sd_line(test_1, test_2)
p

p <- p +
geom_smooth(method = "lm", se = FALSE,
color = "red")
p
