Linear regression for binary outcomes

names(spam)

##  [1] "make"              "address"           "all"              
##  [4] "num3d"             "our"               "over"             
##  [7] "remove"            "internet"          "order"            
## [10] "mail"              "receive"           "will"             
## [13] "people"            "report"            "addresses"        
## [16] "free"              "business"          "email"            
## [19] "you"               "credit"            "your"             
## [22] "font"              "num000"            "money"            
## [25] "hp"                "hpl"               "george"           
## [28] "num650"            "lab"               "labs"             
## [31] "telnet"            "num857"            "data"             
## [34] "num415"            "num85"             "technology"       
## [37] "num1999"           "parts"             "pm"               
## [40] "direct"            "cs"                "meeting"          
## [43] "original"          "project"           "re"               
## [46] "edu"               "table"             "conference"       
## [49] "char_semicolon"    "char_left_paren"   "char_left_bracket"
## [52] "char_exclamation"  "char_dollar"       "char_pound"       
## [55] "capital_avg"       "capital_long"      "capital_total"    
## [58] "is_spam"

table(spam$is_spam)

0	1
2788	1813

head(spam$money)

## [1] 0.00 0.43 0.06 0.00 0.00 0.00

summary(spam$money)

Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
0	0	0	0.09427	0	12.5

model <- lm(is_spam ~ 1 + char_dollar + credit +
              money + re, data = spam)
model

## 
## Call:
## lm(formula = is_spam ~ 1 + char_dollar + credit + money + re, 
##     data = spam)
## 
## Coefficients:
## (Intercept)  char_dollar       credit        money           re  
##     0.33459      0.58551      0.15752      0.18794     -0.05355

pred <- predict(model)
head(pred)

##         1         2         3         4         5         6 
## 0.3345915 0.5207985 0.5007950 0.3345915 0.3345915 0.3345915

summary(pred)

Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
-0.8125	0.3346	0.3346	0.394	0.3954	3.849

Logistic regression

inv_logit <- function(x) {
  exp(x)/(1 + exp(x))
}

inv_logit(0)

## [1] 0.5

Maximum likelihood estimation

model <- glm(is_spam ~ 1 + char_dollar + credit +  
               money + re, data = spam, 
             family = 'binomial')
model

## 
## Call:  glm(formula = is_spam ~ 1 + char_dollar + credit + money + re, 
##     family = "binomial", data = spam)
## 
## Coefficients:
## (Intercept)  char_dollar       credit        money           re  
##     -1.0666      11.8176       2.3119       1.9933      -0.7755  
## 
## Degrees of Freedom: 4600 Total (i.e. Null);  4596 Residual
## Null Deviance:       6170 
## Residual Deviance: 4428  AIC: 4438

Interpreting regression coefficients

coef(model)

## (Intercept) char_dollar      credit       money          re 
##  -1.0665628  11.8175673   2.3118984   1.9932803  -0.7755045

exp(coef(model))

##  (Intercept)  char_dollar       credit        money           re 
## 3.441895e-01 1.356139e+05 1.009357e+01 7.339570e+00 4.604714e-01

Model predictions

pred <- predict(model)
head(pred)

##         1         2         3         4         5         6 
## -1.066563  1.917710  1.920744 -1.066563 -1.066563 -1.066563

summary(pred)

Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
-17.68	-1.067	-1.067	-0.01852	0.02937	69.87

p <- predict(model, type = 'response')
head(p)

##         1         2         3         4         5         6 
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573

summary(p)

Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
0	0.2561	0.2561	0.394	0.5073	1

Model inspection & evaluation

Calibration

spam <- spam %>%
  mutate(
    prediction = predict(model, type='response'),
    rounded_pred = plyr::round_any(prediction, .1)
    )

spam %>%
  select(prediction, rounded_pred) %>%
  head

prediction	rounded_pred
0.2560573	0.3
0.8718828	0.9
0.8722213	0.9
0.2560573	0.3
0.2560573	0.3
0.2560573	0.3

calibration <- spam %>%
  group_by(rounded_pred) %>%
  summarize(
    freq = mean(is_spam),
    pred = mean(prediction),
    count = n())

head(calibration)

rounded_pred	freq	pred	count
0.0	0.0250000	0.0214004	120
0.1	0.0732601	0.1088481	273
0.2	0.1863636	0.1935259	440
0.3	0.2248224	0.2598529	2393
0.4	0.5625000	0.4049039	160
0.5	0.7200000	0.5012132	125

p <- calibration %>%
  ggplot(aes(pred, freq)) +
  geom_point(aes(size = count), alpha=0.8) +
  scale_size_area(max_size = 5, guide = FALSE) +
  geom_abline(intercept = 0, slope = 1)
p

Accuracy

head(spam$prediction)

##         1         2         3         4         5         6 
## 0.2560573 0.8718828 0.8722213 0.2560573 0.2560573 0.2560573

spam <- spam %>%
  mutate(pred_type = prediction > 0.5)

head(spam$pred_type)

##     1     2     3     4     5     6 
## FALSE  TRUE  TRUE FALSE FALSE FALSE

accuracy <- mean(spam$pred_type == spam$is_spam)
accuracy

## [1] 0.8072158

table(spam$is_spam)

0	1
2788	1813

mean(spam$is_spam == FALSE)

## [1] 0.6059552

Precision

pos <- spam %>%
  filter(pred_type == TRUE)

precision <- mean(pos$is_spam)
precision

## [1] 0.9012132

Sensitivity

true <- spam %>%
  filter(is_spam == TRUE)

recall <- mean(true$pred_type)
recall

## [1] 0.5736349

Specificity

false <- spam %>%
  filter(is_spam == FALSE)

specificity <- mean(1 - false$pred_type)
specificity

## [1] 0.9591105

Selecting the threshold

spam <- spam %>%
  mutate(pred_type_75 = prediction > .75)

pos75 <- spam %>%
  filter(pred_type_75 == TRUE)
true <- spam %>%
  filter(is_spam == TRUE)
false <- spam %>%
  filter(is_spam == FALSE)

New performance measures

performance <- tribble(
  ~threshold, ~precision, ~recall, ~specificity,
  .5, mean(pos$is_spam),
  mean(true$pred_type), mean(1 - false$pred_type),
  
  .75, mean(pos75$is_spam),
  mean(true$pred_type_75), mean(1 - false$pred_type_75)
)

performance

threshold	precision	recall	specificity
0.50	0.9012132	0.5736349	0.9591105
0.75	0.9329412	0.4373966	0.9795552

ROC curve

library(ROCR)

pred <- prediction(spam$prediction, spam$is_spam)
perf <- performance(pred, "tpr", "fpr")
plot(perf)

AUC

auc <- performance(pred, "auc")
unlist(slot(auc, "y.values"))

## [1] 0.8230104

Logistic Regression

Linear regression for binary outcomes

Logistic regression

Maximum likelihood estimation

Interpreting regression coefficients

Model predictions

Model inspection & evaluation

Calibration

Accuracy

Precision

Sensitivity

Specificity

Selecting the threshold

New performance measures

ROC curve

AUC