Linear regression for binary outcomes
names(spam) 
##  [1] "make"              "address"           "all"              
##  [4] "num3d"             "our"               "over"             
##  [7] "remove"            "internet"          "order"            
## [10] "mail"              "receive"           "will"             
## [13] "people"            "report"            "addresses"        
## [16] "free"              "business"          "email"            
## [19] "you"               "credit"            "your"             
## [22] "font"              "num000"            "money"            
## [25] "hp"                "hpl"               "george"           
## [28] "num650"            "lab"               "labs"             
## [31] "telnet"            "num857"            "data"             
## [34] "num415"            "num85"             "technology"       
## [37] "num1999"           "parts"             "pm"               
## [40] "direct"            "cs"                "meeting"          
## [43] "original"          "project"           "re"               
## [46] "edu"               "table"             "conference"       
## [49] "char_semicolon"    "char_left_paren"   "char_left_bracket"
## [52] "char_exclamation"  "char_dollar"       "char_pound"       
## [55] "capital_avg"       "capital_long"      "capital_total"    
## [58] "is_spam"
table(spam$is_spam)
head(spam$money)
## [1] 0.00 0.43 0.06 0.00 0.00 0.00
summary(spam$money)
# Linear probability model: ordinary least squares fit of the is_spam
# outcome on an intercept and four predictor columns of `spam`.
model <- lm(
  formula = is_spam ~ 1 + char_dollar + credit + money + re,
  data = spam
)
print(model)
## 
## Call:
## lm(formula = is_spam ~ 1 + char_dollar + credit + money + re, 
##     data = spam)
## 
## Coefficients:
## (Intercept)  char_dollar       credit        money           re  
##     0.33459      0.58551      0.15752      0.18794     -0.05355
# Predictions of the linear model for the rows it was fitted on (no newdata
# is supplied). A linear model does not constrain these values to [0, 1].
pred <- predict(model)
head(pred)
##         1         2         3         4         5         6 
## 0.3345915 0.5207985 0.5007950 0.3345915 0.3345915 0.3345915
summary(pred)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| -0.8125 | 0.3346 | 0.3346 | 0.394 | 0.3954 | 3.849 |
 
Logistic regression
#' Inverse logit (logistic) function
#'
#' Maps values on the log-odds scale to probabilities in [0, 1].
#'
#' @param x A numeric vector of log-odds.
#' @return A numeric vector of the same length as `x`, with attributes such
#'   as names preserved.
inv_logit <- function(x) {
  # Written as 1 / (1 + exp(-x)) rather than exp(x) / (1 + exp(x)): the
  # latter evaluates to Inf / Inf = NaN once exp(x) overflows, whereas this
  # form saturates cleanly at 1 (and at 0 for very negative x).
  1 / (1 + exp(-x))
}
inv_logit(0)
## [1] 0.5

 
Maximum likelihood estimation
# Logistic regression: same predictors as the linear fit, estimated by
# maximum likelihood with a binomial family.
model <- glm(
  formula = is_spam ~ 1 + char_dollar + credit + money + re,
  family = "binomial",
  data = spam
)
print(model)
## 
## Call:  glm(formula = is_spam ~ 1 + char_dollar + credit + money + re, 
##     family = "binomial", data = spam)
## 
## Coefficients:
## (Intercept)  char_dollar       credit        money           re  
##     -1.0666      11.8176       2.3119       1.9933      -0.7755  
## 
## Degrees of Freedom: 4600 Total (i.e. Null);  4596 Residual
## Null Deviance:       6170 
## Residual Deviance: 4428  AIC: 4438
 
Interpreting regression coefficients
coef(model)
## (Intercept) char_dollar      credit       money          re 
##  -1.0665628  11.8175673   2.3118984   1.9932803  -0.7755045
# Exponentiate the coefficients: for a binomial model with the default logit
# link, exp(coefficient) is the multiplicative change in the odds per unit
# increase of that predictor (an odds ratio).
exp(coef(model))
##  (Intercept)  char_dollar       credit        money           re 
## 3.441895e-01 1.356139e+05 1.009357e+01 7.339570e+00 4.604714e-01
 
Model predictions
# predict() on a glm defaults to type = "link", so these values are on the
# linear-predictor (log-odds) scale, not probabilities.
pred <- predict(model)
head(pred)
##         1         2         3         4         5         6 
## -1.066563  1.917710  1.920744 -1.066563 -1.066563 -1.066563
summary(pred)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| -17.68 | -1.067 | -1.067 | -0.01852 | 0.02937 | 69.87 |
# Predicted probabilities: type = "response" returns predictions on the
# scale of the outcome rather than the linear predictor.
p <- predict(object = model, type = "response")
head(p, n = 6L)
##         1         2         3         4         5         6 
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573
summary(p)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| 0 | 0.2561 | 0.2561 | 0.394 | 0.5073 | 1 |
 
Model inspection & evaluation
Calibration
# Attach the model's predicted probability to every row, then bin it to the
# nearest multiple of 0.1 for the calibration table.
spam <- spam %>%
  mutate(prediction = predict(model, type = "response")) %>%
  mutate(rounded_pred = plyr::round_any(prediction, 0.1))

# Show the first rows of the two new columns.
head(select(spam, prediction, rounded_pred))
| prediction | rounded_pred |
|---|---|
| 0.2560573 | 0.3 |
| 0.8718828 | 0.9 |
| 0.8722213 | 0.9 |
| 0.2560573 | 0.3 |
| 0.2560573 | 0.3 |
| 0.2560573 | 0.3 |
# Calibration table, one row per prediction bin:
#   freq  - observed share of spam in the bin
#   pred  - mean predicted probability in the bin
#   count - number of rows in the bin
calibration <- summarize(
  group_by(spam, rounded_pred),
  freq = mean(is_spam),
  pred = mean(prediction),
  count = n()
)
calibration %>% head()
| rounded_pred | freq | pred | count |
|---|---|---|---|
| 0.0 | 0.0250000 | 0.0214004 | 120 |
| 0.1 | 0.0732601 | 0.1088481 | 273 |
| 0.2 | 0.1863636 | 0.1935259 | 440 |
| 0.3 | 0.2248224 | 0.2598529 | 2393 |
| 0.4 | 0.5625000 | 0.4049039 | 160 |
| 0.5 | 0.7200000 | 0.5012132 | 125 |
# Calibration plot: mean predicted probability (x) against observed spam
# frequency (y) for each bin, with point area proportional to the bin's row
# count. The y = x reference line marks perfect calibration.
# guide = "none" hides the size legend; it replaces the deprecated
# guide = FALSE, which newer ggplot2 releases warn about.
p <- calibration %>%
  ggplot(aes(pred, freq)) +
  geom_point(aes(size = count), alpha = 0.8) +
  scale_size_area(max_size = 5, guide = "none") +
  geom_abline(intercept = 0, slope = 1)
p

 
Accuracy
head(spam$prediction)
##         1         2         3         4         5         6 
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573
# Classify a row as spam when its predicted probability exceeds 0.5.
spam <- mutate(spam, pred_type = prediction > 0.5)
head(spam[["pred_type"]])
##     1     2     3     4     5     6 
## FALSE  TRUE  TRUE FALSE FALSE FALSE
# Accuracy: share of rows whose predicted class equals the observed class.
accuracy <- with(spam, mean(pred_type == is_spam))
print(accuracy)
## [1] 0.8072158
# Class balance, followed by the share of non-spam rows. The latter is the
# accuracy a classifier that always predicts "not spam" would achieve, and
# so serves as a baseline for the model's accuracy.
table(spam$is_spam)
mean(spam$is_spam == FALSE)
## [1] 0.6059552
 
Precision
# Precision: among rows predicted to be spam, the share that really is spam.
pos <- filter(spam, pred_type)
precision <- mean(pos[["is_spam"]])
print(precision)
## [1] 0.9012132
 
Sensitivity
# Sensitivity (recall): among rows that really are spam, the share the
# model predicts as spam.
true <- filter(spam, is_spam == TRUE)
recall <- mean(true[["pred_type"]])
print(recall)
## [1] 0.5736349
 
Specificity
# Specificity: among rows that are not spam, the share the model predicts
# as not spam.
false <- filter(spam, is_spam == FALSE)
specificity <- mean(1 - false[["pred_type"]])
print(specificity)
## [1] 0.9591105
 
Selecting the threshold
# Re-classify with a stricter cutoff: predict spam only when the predicted
# probability exceeds 0.75.
spam <- mutate(spam, pred_type_75 = prediction > 0.75)

# Rows predicted as spam under the new cutoff, and the observed-spam /
# observed-non-spam subsets (now carrying the pred_type_75 column).
# NOTE(review): no metric is computed from these subsets in the visible
# source.
pos75 <- filter(spam, pred_type_75)
true <- filter(spam, is_spam == TRUE)
false <- filter(spam, is_spam == FALSE)
 
ROC curve
library(ROCR)

# Build a ROCR prediction object from the predicted probabilities and the
# observed labels, then plot true positive rate against false positive rate.
pred <- prediction(predictions = spam$prediction, labels = spam$is_spam)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)

 
AUC
# Area under the ROC curve; ROCR stores the value in the y.values slot.
auc <- performance(pred, measure = "auc")
unlist(auc@y.values)
## [1] 0.8230104