Linear regression for binary outcomes
names(spam)
## [1] "make" "address" "all"
## [4] "num3d" "our" "over"
## [7] "remove" "internet" "order"
## [10] "mail" "receive" "will"
## [13] "people" "report" "addresses"
## [16] "free" "business" "email"
## [19] "you" "credit" "your"
## [22] "font" "num000" "money"
## [25] "hp" "hpl" "george"
## [28] "num650" "lab" "labs"
## [31] "telnet" "num857" "data"
## [34] "num415" "num85" "technology"
## [37] "num1999" "parts" "pm"
## [40] "direct" "cs" "meeting"
## [43] "original" "project" "re"
## [46] "edu" "table" "conference"
## [49] "char_semicolon" "char_left_paren" "char_left_bracket"
## [52] "char_exclamation" "char_dollar" "char_pound"
## [55] "capital_avg" "capital_long" "capital_total"
## [58] "is_spam"
table(spam$is_spam)
head(spam$money)
## [1] 0.00 0.43 0.06 0.00 0.00 0.00
summary(spam$money)
model <- lm(is_spam ~ 1 + char_dollar + credit +
money + re, data = spam)
model
##
## Call:
## lm(formula = is_spam ~ 1 + char_dollar + credit + money + re,
## data = spam)
##
## Coefficients:
## (Intercept) char_dollar credit money re
## 0.33459 0.58551 0.15752 0.18794 -0.05355
pred <- predict(model)
head(pred)
## 1 2 3 4 5 6
## 0.3345915 0.5207985 0.5007950 0.3345915 0.3345915 0.3345915
summary(pred)
-0.8125 |
0.3346 |
0.3346 |
0.394 |
0.3954 |
3.849 |
Logistic regression
inv_logit <- function(x) {
exp(x)/(1 + exp(x))
}
inv_logit(0)
## [1] 0.5
Maximum likelihood estimation
model <- glm(is_spam ~ 1 + char_dollar + credit +
money + re, data = spam,
family = 'binomial')
model
##
## Call: glm(formula = is_spam ~ 1 + char_dollar + credit + money + re,
## family = "binomial", data = spam)
##
## Coefficients:
## (Intercept) char_dollar credit money re
## -1.0666 11.8176 2.3119 1.9933 -0.7755
##
## Degrees of Freedom: 4600 Total (i.e. Null); 4596 Residual
## Null Deviance: 6170
## Residual Deviance: 4428 AIC: 4438
Interpreting regression coefficients
coef(model)
## (Intercept) char_dollar credit money re
## -1.0665628 11.8175673 2.3118984 1.9932803 -0.7755045
exp(coef(model))
## (Intercept) char_dollar credit money re
## 3.441895e-01 1.356139e+05 1.009357e+01 7.339570e+00 4.604714e-01
Model predictions
pred <- predict(model)
head(pred)
## 1 2 3 4 5 6
## -1.066563 1.917710 1.920744 -1.066563 -1.066563 -1.066563
summary(pred)
-17.68 |
-1.067 |
-1.067 |
-0.01852 |
0.02937 |
69.87 |
p <- predict(model, type = 'response')
head(p)
## 1 2 3 4 5 6
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573
summary(p)
0 |
0.2561 |
0.2561 |
0.394 |
0.5073 |
1 |
Model inspection & evaluation
Calibration
spam <- spam %>%
mutate(
prediction = predict(model, type='response'),
rounded_pred = plyr::round_any(prediction, .1)
)
spam %>%
select(prediction, rounded_pred) %>%
head
0.2560573 |
0.3 |
0.8718828 |
0.9 |
0.8722213 |
0.9 |
0.2560573 |
0.3 |
0.2560573 |
0.3 |
0.2560573 |
0.3 |
calibration <- spam %>%
group_by(rounded_pred) %>%
summarize(
freq = mean(is_spam),
pred = mean(prediction),
count = n())
head(calibration)
0.0 |
0.0250000 |
0.0214004 |
120 |
0.1 |
0.0732601 |
0.1088481 |
273 |
0.2 |
0.1863636 |
0.1935259 |
440 |
0.3 |
0.2248224 |
0.2598529 |
2393 |
0.4 |
0.5625000 |
0.4049039 |
160 |
0.5 |
0.7200000 |
0.5012132 |
125 |
p <- calibration %>%
ggplot(aes(pred, freq)) +
geom_point(aes(size = count), alpha=0.8) +
scale_size_area(max_size = 5, guide = FALSE) +
geom_abline(intercept = 0, slope = 1)
p
Accuracy
head(spam$prediction)
## 1 2 3 4 5 6
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573
spam <- spam %>%
mutate(pred_type = prediction > 0.5)
head(spam$pred_type)
## 1 2 3 4 5 6
## FALSE TRUE TRUE FALSE FALSE FALSE
accuracy <- mean(spam$pred_type == spam$is_spam)
accuracy
## [1] 0.8072158
table(spam$is_spam)
mean(spam$is_spam == FALSE)
## [1] 0.6059552
Precision
pos <- spam %>%
filter(pred_type == TRUE)
precision <- mean(pos$is_spam)
precision
## [1] 0.9012132
Sensitivity
true <- spam %>%
filter(is_spam == TRUE)
recall <- mean(true$pred_type)
recall
## [1] 0.5736349
Specificity
false <- spam %>%
filter(is_spam == FALSE)
specificity <- mean(1 - false$pred_type)
specificity
## [1] 0.9591105
Selecting the threshold
spam <- spam %>%
mutate(pred_type_75 = prediction > .75)
pos75 <- spam %>%
filter(pred_type_75 == TRUE)
true <- spam %>%
filter(is_spam == TRUE)
false <- spam %>%
filter(is_spam == FALSE)
ROC curve
library(ROCR)
pred <- prediction(spam$prediction, spam$is_spam)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
AUC
auc <- performance(pred, "auc")
unlist(slot(auc, "y.values"))
## [1] 0.8230104