library(plyr)
library(tidyverse)
library(scales)
library(nycflights13)

theme_set(theme_bw())

word_counts <- read.csv('inaugural_words.csv')

Word clouds?

Word clouds comparing inauguration speech for Bush (2001) and Obama (2009)

Bush (2001)

Word cloud of Bush 2001 inauguration speech

Word cloud of Bush 2001 inauguration speech

Obama (2009)

Word cloud of Obama 2009 inauguration speech

Word cloud of Obama 2009 inauguration speech

Unambiguous representation of top 10 words

wc <- word_counts %>%
  group_by(name) %>%
  top_n(10, count) 

wc_bush <- wc %>%
  filter(name == 'bush') %>%
  arrange(count)

p <- ggplot(data=wc_bush, aes(x=word, y=count, group=name)) +
  ggtitle('Bush (2001)\n') +
  geom_bar(stat='identity') +
  scale_x_discrete('Word\n', limits=wc_bush$word) +
  scale_y_continuous('\nCount') +
  coord_flip()

p

wc_obama <- wc %>%
  filter(name == 'obama') %>%
  arrange(count)

p <- ggplot(data=wc_obama, aes(x=word, y=count, group=name)) +
  ggtitle('Obama (2009)\n') +
  geom_bar(stat='identity') +
  scale_x_discrete('Word\n', limits=wc_obama$word) +
  scale_y_continuous('\nCount') +
  coord_flip()

p

Daily volume of flights

plot_data <- flights %>%
  filter(month == 2) %>%
  mutate(date=as.Date(paste(year, month, day, sep='-'))) %>%
  group_by(date) %>%
  summarize(n=n())

head(plot_data)
date n
2013-02-01 926
2013-02-02 682
2013-02-03 814
2013-02-04 932
2013-02-05 896
2013-02-06 901
p <- ggplot(data=plot_data, aes(x=date, y=n)) +
  geom_point()
p

p <- p + 
  geom_line()
p

saturdays <- plot_data %>%
  filter(strftime(date, format='%u') == 6) %>%
  select(date)
p <- p + 
  scale_x_date(breaks=saturdays$date, date_label='%a %b %d')
p

p <- p + 
  scale_x_date(element_blank(), breaks=saturdays$date, date_label='%a %b %d') +
  scale_y_continuous("Flights per day\n", limits=c(600, 1000), labels=comma)
p

Flight delays

plot_data <- flights %>%
  mutate(delay=arr_delay, big_delay=ifelse(delay > 15, 1, 0)) %>%
  group_by(carrier) %>%
  summarize(p_big_delay=mean(big_delay, na.rm=TRUE)) %>%
  join(airlines, by='carrier') %>%
  mutate(short_name=gsub('(\\w+).*', '\\1', name))

head(plot_data)
carrier p_big_delay name short_name
9E 0.2492 Endeavor Air Inc. Endeavor
AA 0.1879 American Airlines Inc. American
AS 0.1439 Alaska Airlines Inc. Alaska
B6 0.2611 JetBlue Airways JetBlue
DL 0.1823 Delta Air Lines Inc. Delta
EV 0.3136 ExpressJet Airlines Inc. ExpressJet
p <- ggplot(data=plot_data, aes(x=p_big_delay, y=short_name)) +
  geom_point() +
  scale_x_continuous(element_blank(), label=percent, limits=c(0, 1)) +
  scale_y_discrete(element_blank()) +
  ggtitle('Percentage of flights more than 15 minutes late\n')
p

p <- p + 
  scale_x_continuous(element_blank(), label=percent, limits=c(.1, .4))
p

carriers_ranked <- plot_data %>%
  arrange(p_big_delay)
p <- p +  
  scale_x_continuous(element_blank(), label=percent, limits=c(.1, .4)) +
  scale_y_discrete(element_blank(), limits=carriers_ranked$short_name)
p

national <- c('JetBlue', 'Southwest', 'United', 'American', 'Delta', 'US')
plot_data <- plot_data %>%
  mutate(carrier_type=ifelse(short_name %in% national, 'National', 'Regional'))

p <- ggplot(data=plot_data, aes(x=p_big_delay, y=short_name)) +
  geom_point(aes(shape=carrier_type)) +
  scale_x_continuous(element_blank(), label=percent, limits=c(.1, .4)) +
  scale_y_discrete(element_blank(), limits=carriers_ranked$short_name) +
  scale_shape_manual('Carrier type', values=c(1, 16)) +
  theme(legend.position=c(1,0), legend.justification=c(1,0),
        legend.background=element_blank()) +
  ggtitle('Percentage of flights more than 15 minutes late\n')
p

Stop and Frisk

Hitrate by precinct

load('sqf.Rdata')
p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
  geom_point() +
  scale_x_continuous('\nHit rate for black suspects',
                     labels=percent, limits=c(0, .3)) +
  scale_y_continuous('Hit rate for white suspects\n',
                     labels=percent, limits=c(0, .5))

p

p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
  geom_point() +
  scale_x_continuous('\nHit rate for black suspects',
                     labels=percent, limits=c(0, .5)) +
  scale_y_continuous('Hit rate for white suspects\n',
                     labels=percent, limits=c(0, .5))

p

p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
  geom_point() +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  scale_x_continuous('\nHit rate for black suspects',
                     labels=percent, limits=c(0, .5)) +
  scale_y_continuous('Hit rate for white suspects\n',
                     labels=percent, limits=c(0, .5))

p

p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
  geom_point(size=1) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  scale_x_continuous('\nHit rate for black suspects',
                     labels=percent, limits=c(0, .5)) +
  scale_y_continuous('Hit rate for white suspects\n',
                     labels=percent, limits=c(0, .5))

p

Hitrate by precinct and location

p <- ggplot(data=hitrate_by_location, aes(x=black, y=white,
                                          group=location.housing)) +
  geom_point(aes(color=location.housing), alpha=.6) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  scale_color_discrete(element_blank(),
                       breaks=c('housing', 'neither', 'transit'),
                       labels=c('Public housing', 'Pedestrian', 'Transit')) +
  scale_x_continuous('\nHit rate for black suspects',
                     labels=percent, limits=c(0, .8)) +
  scale_y_continuous('Hit rate for white suspects\n',
                     labels=percent, limits=c(0, .8)) +
  theme(legend.position=c(1, 0), legend.justification=c(1, 0),
        legend.background=element_blank())

p

p <- ggplot(data=hitrate_by_location, aes(x=black, y=white,
                                          group=location.housing)) +
  geom_point(aes(color=location.housing), alpha=.6) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  scale_color_discrete(element_blank(),
                       breaks=c('housing', 'neither', 'transit'),
                       labels=c('Public housing', 'Pedestrian', 'Transit')) +
  scale_x_continuous('\nHit rate for black suspects',  labels=percent,
                     trans='log10', limits=c(0.003, 1),
                     breaks=c(.003, .01, .03, .1, .3, 1)) +
  scale_y_continuous('Hit rate for white suspects\n',  labels=percent,
                     trans='log10', limits=c(0.003, 1),
                     breaks=c(.003, .01, .03, .1, .3, 1)) +
  theme(legend.position=c(1, 0), legend.justification=c(1, 0),
        legend.background=element_blank())

p

p <- ggplot(data=hitrate_by_location, aes(x=black, y=white,
                                          group=location.housing)) +
  geom_point(aes(color=location.housing, size=count), alpha=.6) +
  geom_abline(slope=1, intercept=0, linetype='dashed') +
  scale_size_area(guide=FALSE) +
  scale_color_discrete(element_blank(),
                       breaks=c('housing', 'neither', 'transit'),
                       labels=c('Public housing', 'Pedestrian', 'Transit')) +
  scale_x_continuous('\nHit rate for black suspects',  labels=percent,
                     trans='log10', limits=c(0.003, 1),
                     breaks=c(.003, .01, .03, .1, .3, 1)) +
  scale_y_continuous('Hit rate for white suspects\n',  labels=percent,
                     trans='log10', limits=c(0.003, 1),
                     breaks=c(.003, .01, .03, .1, .3, 1)) +
  theme(legend.position=c(1, 0), legend.justification=c(1, 0),
        legend.background=element_blank())

p

Voter intent

Voter intent over time (2012 presidential election)

load('voter_intent.Rdata')
head(voter_intent)
date p_obama
2012-09-22 0.4649
2012-09-23 0.4883
2012-09-24 0.4744
2012-09-25 0.5069
2012-09-26 0.4729
2012-09-27 0.5108
p <- ggplot(data=voter_intent, aes(x=date, y=p_obama)) +
  geom_point() +
  geom_line() +
  scale_x_date(element_blank(), date_breaks='1 week', date_labels='%b. %d') +
  scale_y_continuous('Percent support for Obama\n', labels=percent,
                     limits=c(0, 1))
p

p <-  p +
  scale_y_continuous('Percent support for Obama\n', labels=percent,
                     limits=c(.3, .55))
p

p <-  p +
  geom_vline(data=debates, aes(xintercept=as.numeric(dates)), linetype='dashed')
p

debates$event <- paste(c('First', 'Second', 'Third'), 'debate', sep='\n')
p <-  p +
  geom_text(data=debates, aes(x=dates, y=.55, label=event),
            vjust=1, nudge_x=c(-3, -3, 3), size=3)
p

Demographic distribution

capitalize_all <- Vectorize(function(x) {
  s <- strsplit(x, " ")[[1]]
  paste(toupper(substring(s, 1,1)), substring(s, 2), sep="", collapse=" ")
})

p <- ggplot(demographic_dist, aes(x=attribute, y=p, group=source)) +
  geom_bar(aes(color=source, fill=source), stat='identity', position='dodge') +
  facet_grid(.~cat, scales="free_x", space="free") +
  scale_y_continuous(element_blank(), limits=c(0,1), labels=percent_format()) +
  scale_x_discrete(element_blank(), labels=capitalize_all) +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
  theme(legend.position='bottom', legend.title=element_blank())

p

p <- ggplot(demographic_dist, aes(attribute, p, group=source)) +
  geom_point(aes(color=source)) +
  geom_line(aes(color=source)) +
  facet_grid(.~cat, scales="free_x", space="free") +
  scale_y_continuous(element_blank(), limits=c(0,1), labels=percent_format()) +
  scale_x_discrete(element_blank(), labels=capitalize_all) +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
  theme(legend.position='bottom', legend.title=element_blank())

p

p <- ggplot(demographic_dist, aes(attribute, p, group=source)) +
  geom_point() +
  geom_line(aes(linetype=source)) +
  facet_grid(.~cat, scales="free_x", space="free") +
  scale_y_continuous(element_blank(), limits=c(0,1), labels=percent_format()) +
  scale_x_discrete(element_blank(), labels=capitalize_all) +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
  theme(legend.position='bottom', legend.title=element_blank())

p

Third party Ads

load('third_party_ads.Rdata')

# Helper function to make labels pretty
getPrettyLdaTopicLabel <- function (topic) {
  pretty.topic <- sub('pub_games', 'Gaming', topic)
  pretty.topic <- sub('general', 'miscellaneous', pretty.topic)
  pretty.topic <- sub('(pub|ref|web-services|offline)_', '', pretty.topic)
  pretty.topic <- sub('_', ' ', pretty.topic)
  pretty.topic <- sub('peoplesearch', 'people search', pretty.topic)
  pretty.topic <- sub('porn', 'Pornography', pretty.topic)
  pretty.topic <- sub('gov', 'Government', pretty.topic)
  pretty.topic <- sub('entertainment tv', 'entertainment/TV', pretty.topic)
  pretty.topic <- gsub('entertainment (\\w)', 'entertainment/\\U\\1',
                       pretty.topic, perl=TRUE)
  pretty.topic <- gsub('gambling (\\w)', 'gambling/\\U\\1', pretty.topic,
                       perl=TRUE)
  pretty.topic <- gsub('^(\\w)', '\\U\\1', pretty.topic, perl=TRUE)
  pretty.topic
}
p <- ggplot(third_party_ads, aes(y=topic, x=pct_susceptible)) + 
  facet_grid(high_level_topic ~ ., space='free', scales='free', drop=TRUE) +
  geom_point() + 
  scale_y_discrete(element_blank(), labels=getPrettyLdaTopicLabel) +
  scale_x_continuous('\nProportion of traffic supported by third-party ads',
                     label=percent)
p

p <- ggplot(third_party_ads, 
            aes(y=topic, x=pct_susceptible, size=topic_pageviews)) + 
  facet_grid(high_level_topic ~ ., space='free', scales='free', drop=TRUE) +
  geom_point() + 
  scale_y_discrete(element_blank(), labels=getPrettyLdaTopicLabel) +
  scale_x_continuous('\nProportion of traffic supported by third-party ads',
                     label=percent) +
  scale_size_area(guide=FALSE, breaks=c(1e8, 2.5e8, 5e8, 1e9, 2e9),  
                  labels=c('100m', '250m', '500m', '1bn', '2bn'))
p

# Additional value to plot 
# Usually this would be computed from some data; it is presented here as a 
# hard-coded value for illustrative purposes
non_commerce_susceptibility_overall_pct <- 0.32

p <- ggplot(third_party_ads, 
            aes(y=topic, x=pct_susceptible, size=topic_pageviews)) + 
  facet_grid(high_level_topic ~ ., space='free', scales='free', drop=TRUE) +
  geom_point() + 
  scale_y_discrete(element_blank(), labels=getPrettyLdaTopicLabel) +
  scale_x_continuous('\nProportion of traffic supported by third-party ads',
                     label=percent) +
  scale_size_area(guide=FALSE, breaks=c(1e8, 2.5e8, 5e8, 1e9, 2e9),  
                  labels=c('100m', '250m', '500m', '1bn', '2bn')) + 
  geom_vline(aes(xintercept=pct_susceptible_high), linetype='dashed') +
  geom_vline(aes(xintercept=non_commerce_susceptibility_overall_pct),  
             linetype='solid')
p

Twitter visualization

load('twitter_vis.Rdata')

# function to append units (k, M, B) to labels
addUnits <- function(n) {
  labels <- ifelse(n < 1000, n,  # less than thousands
    ifelse(n < 1e6, paste0(round(n/1e3), 'k'),  # in thousands
    ifelse(n < 1e9, paste0(round(n/1e6), 'M'),  # in millions
    ifelse(n < 1e12, paste0(round(n/1e9), 'B'), # in billions
                            'too big!'
                            ))))
  return(labels)
}

# function to capitalize first letter of a string
capitalize <- function(s) {
  paste(toupper(substring(s, 1, 1)), substring(s, 2), sep="")
}

Cascade size histogram

size.dist.trunc <- size.dist %>%
  filter(size <= 1000)

p <- ggplot(data=size.dist.trunc, aes(x=size)) +
  geom_histogram(aes(weight=count), binwidth = 30) +
  scale_x_continuous('\nCascade Size') +
  scale_y_continuous('Number of cascades\n', label=addUnits) +
  coord_cartesian(xlim=c(0, 1000))
p

p <- ggplot(data=size.dist.trunc, aes(x=size)) +
  geom_histogram(aes(weight=count), binwidth = 30, position='dodge') +
  scale_x_continuous('\nCascade Size') +
  scale_y_log10('Number of cascades\n', label=addUnits) +
  coord_cartesian(xlim=c(0, 1000))
p

Warning!

Using specifying axis limits when using a stat_ geom (e.g., geom_histogram, geom_bar) could result in unexpected data loss. Always use coord_cartesian to adjust axis limits in such cases.

p <- ggplot(data=size.dist.trunc, aes(x=size)) +
  geom_histogram(aes(weight=count), binwidth = 30, position='dodge') +
  scale_x_continuous('\nCascade Size', limits=c(0, 1000)) +
  scale_y_log10('Number of cascades\n', label=addUnits)
p

Cascade size distribution

p <- ggplot(data=size.dist, aes(x=size, y=1-cdf)) +
  geom_line() + 
  scale_x_continuous(labels=comma, limits=c(1,1e4)) + 
  scale_y_log10(breaks=10^seq(-7,0), labels=sapply(10^seq(-7,0), percent),
                limits=c(1e-7, .1)) + 
  labs(x='\nCascade Size', y='CCDF')
p

p <- ggplot(data=size.dist, aes(x=size, y=1-cdf)) +
  geom_line() +
  scale_x_log10(labels=comma, breaks=10^(0:log10(max(size.dist$size))),
                limits=c(1,1e4)) +
  scale_y_log10(breaks=10^seq(-7,0), labels=sapply(10^seq(-7,0), percent),
                limits=c(1e-7, .1)) +
  labs(x='\nCascade Size', y='CCDF') 
p

p <- ggplot(data=dist.by.category, aes(x=size, y=ccdf, group=category)) +
  geom_line(aes(color=category)) +
  scale_x_log10(labels=comma, breaks=10^(0:log10(max(stats$size)))) +
  scale_y_log10(breaks=10^seq(-5,0), labels=sapply(10^seq(-5,0), percent)) +
  labs(x='\nCascade Size', y='CCDF') +
  theme(legend.position=c(0, 0), legend.justification=c(0, 0), 
        legend.title=element_blank(), legend.background=element_blank()) +
  scale_color_discrete(labels=capitalize, breaks=category.breaks)
p

Popularity vs. Virality

Scatter plots

p <- ggplot(data=stats, aes(x=size, y=avg.dist)) +
  geom_point() +
  scale_x_log10(labels=comma) +
  scale_y_continuous() +
  labs(x='Cascade size', y='Structural virality')
p

p <- ggplot(data=stats, aes(x=size, y=avg.dist)) +
  geom_point() +
  scale_x_log10(labels=comma) +
  scale_y_log10(labels=comma) +
  labs(x='Cascade size', y='Structural virality')
p

p <- ggplot(data=stats, aes(x=size, y=avg.dist)) +
  geom_point(size=1) +
  scale_x_log10(labels=comma) +
  scale_y_log10() +
  labs(x='Cascade size', y='Structural virality')
p

p <- ggplot(data=stats, aes(x=size, y=avg.dist)) +
  geom_point(size=1, alpha=0.5) +
  scale_x_log10(labels=comma) +
  scale_y_log10() +
  labs(x='Cascade size', y='Structural virality')
p

Box plots

p <- ggplot(data=stats.box, aes(x=factor(size.bin), y=avg.dist)) +
  geom_boxplot() +
  scale_x_discrete(labels=comma(c(100, 300, 1e3, 3e3, 1e4))) +
  scale_y_log10(breaks=c(3, 10, 30)) +
  labs(x='Cascade Size', y='Structural virality')
p

p <- ggplot(data=stats.box, aes(x=factor(size.bin), y=avg.dist)) +
  facet_grid(~ category, scales='free_x', space='free_x') +
  geom_boxplot() +
  scale_x_discrete(labels=comma(c(100, 300, 1e3, 3e3, 1e4))) +
  scale_y_log10(breaks=c(3, 10, 30, 100)) +
  labs(x='Cascade Size', y='Structural virality')
p