Linear regression for binary outcomes

names(spam) 
##  [1] "make"              "address"           "all"              
##  [4] "num3d"             "our"               "over"             
##  [7] "remove"            "internet"          "order"            
## [10] "mail"              "receive"           "will"             
## [13] "people"            "report"            "addresses"        
## [16] "free"              "business"          "email"            
## [19] "you"               "credit"            "your"             
## [22] "font"              "num000"            "money"            
## [25] "hp"                "hpl"               "george"           
## [28] "num650"            "lab"               "labs"             
## [31] "telnet"            "num857"            "data"             
## [34] "num415"            "num85"             "technology"       
## [37] "num1999"           "parts"             "pm"               
## [40] "direct"            "cs"                "meeting"          
## [43] "original"          "project"           "re"               
## [46] "edu"               "table"             "conference"       
## [49] "char_semicolon"    "char_left_paren"   "char_left_bracket"
## [52] "char_exclamation"  "char_dollar"       "char_pound"       
## [55] "capital_avg"       "capital_long"      "capital_total"    
## [58] "is_spam"

table(spam$is_spam)
0 1
2788 1813
head(spam$money)
## [1] 0.00 0.43 0.06 0.00 0.00 0.00
summary(spam$money)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0.09427 0 12.5

model <- lm(is_spam ~ 1 + char_dollar + credit +
              money + re, data = spam)
model
## 
## Call:
## lm(formula = is_spam ~ 1 + char_dollar + credit + money + re, 
##     data = spam)
## 
## Coefficients:
## (Intercept)  char_dollar       credit        money           re  
##     0.33459      0.58551      0.15752      0.18794     -0.05355

pred <- predict(model)
head(pred)
##         1         2         3         4         5         6 
## 0.3345915 0.5207985 0.5007950 0.3345915 0.3345915 0.3345915
summary(pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.8125 0.3346 0.3346 0.394 0.3954 3.849

Logistic regression

inv_logit <- function(x) {
  exp(x)/(1 + exp(x))
}

inv_logit(0)
## [1] 0.5

Maximum likelihood estimation

model <- glm(is_spam ~ 1 + char_dollar + credit +  
               money + re, data = spam, 
             family = 'binomial')
model
## 
## Call:  glm(formula = is_spam ~ 1 + char_dollar + credit + money + re, 
##     family = "binomial", data = spam)
## 
## Coefficients:
## (Intercept)  char_dollar       credit        money           re  
##     -1.0666      11.8176       2.3119       1.9933      -0.7755  
## 
## Degrees of Freedom: 4600 Total (i.e. Null);  4596 Residual
## Null Deviance:       6170 
## Residual Deviance: 4428  AIC: 4438

Interpreting regression coefficients

coef(model)
## (Intercept) char_dollar      credit       money          re 
##  -1.0665628  11.8175673   2.3118984   1.9932803  -0.7755045
exp(coef(model))
##  (Intercept)  char_dollar       credit        money           re 
## 3.441895e-01 1.356139e+05 1.009357e+01 7.339570e+00 4.604714e-01

Model predictions

pred <- predict(model)
head(pred)
##         1         2         3         4         5         6 
## -1.066563  1.917710  1.920744 -1.066563 -1.066563 -1.066563
summary(pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-17.68 -1.067 -1.067 -0.01852 0.02937 69.87

p <- predict(model, type = 'response')
head(p)
##         1         2         3         4         5         6 
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573
summary(p)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0.2561 0.2561 0.394 0.5073 1

Model inspection & evaluation

Calibration

spam <- spam %>%
  mutate(
    prediction = predict(model, type='response'),
    rounded_pred = plyr::round_any(prediction, .1)
    )

spam %>%
  select(prediction, rounded_pred) %>%
  head
prediction rounded_pred
0.2560573 0.3
0.8718828 0.9
0.8722213 0.9
0.2560573 0.3
0.2560573 0.3
0.2560573 0.3

calibration <- spam %>%
  group_by(rounded_pred) %>%
  summarize(
    freq = mean(is_spam),
    pred = mean(prediction),
    count = n())

head(calibration)
rounded_pred freq pred count
0.0 0.0250000 0.0214004 120
0.1 0.0732601 0.1088481 273
0.2 0.1863636 0.1935259 440
0.3 0.2248224 0.2598529 2393
0.4 0.5625000 0.4049039 160
0.5 0.7200000 0.5012132 125

p <- calibration %>%
  ggplot(aes(pred, freq)) +
  geom_point(aes(size = count), alpha=0.8) +
  scale_size_area(max_size = 5, guide = FALSE) +
  geom_abline(intercept = 0, slope = 1)
p

Accuracy

head(spam$prediction)
##         1         2         3         4         5         6 
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573
spam <- spam %>%
  mutate(pred_type = prediction > 0.5)

head(spam$pred_type)
##     1     2     3     4     5     6 
## FALSE  TRUE  TRUE FALSE FALSE FALSE

accuracy <- mean(spam$pred_type == spam$is_spam)
accuracy
## [1] 0.8072158

table(spam$is_spam)
0 1
2788 1813
mean(spam$is_spam == FALSE)
## [1] 0.6059552

Precision

pos <- spam %>%
  filter(pred_type == TRUE)

precision <- mean(pos$is_spam)
precision
## [1] 0.9012132

Sensitivity

true <- spam %>%
  filter(is_spam == TRUE)

recall <- mean(true$pred_type)
recall
## [1] 0.5736349

Specificity

false <- spam %>%
  filter(is_spam == FALSE)

specificity <- mean(1 - false$pred_type)
specificity
## [1] 0.9591105

Selecting the threshold

spam <- spam %>%
  mutate(pred_type_75 = prediction > .75)

pos75 <- spam %>%
  filter(pred_type_75 == TRUE)
true <- spam %>%
  filter(is_spam == TRUE)
false <- spam %>%
  filter(is_spam == FALSE)

New performance measures

performance <- tribble(
  ~threshold, ~precision, ~recall, ~specificity,
  .5, mean(pos$is_spam),
  mean(true$pred_type), mean(1 - false$pred_type),
  
  .75, mean(pos75$is_spam),
  mean(true$pred_type_75), mean(1 - false$pred_type_75)
)

performance
threshold precision recall specificity
0.50 0.9012132 0.5736349 0.9591105
0.75 0.9329412 0.4373966 0.9795552

ROC curve

library(ROCR)

pred <- prediction(spam$prediction, spam$is_spam)
perf <- performance(pred, "tpr", "fpr")
plot(perf)

AUC

auc <- performance(pred, "auc")
unlist(slot(auc, "y.values"))
## [1] 0.8230104