library(plyr)
library(tidyverse)
library(scales)
library(nycflights13)
theme_set(theme_bw())
word_counts <- read.csv('inaugural_words.csv')
Word clouds comparing inauguration speech for Bush (2001) and Obama (2009)
wc <- word_counts %>%
group_by(name) %>%
top_n(10, count)
wc_bush <- wc %>%
filter(name == 'bush') %>%
arrange(count)
p <- ggplot(data=wc_bush, aes(x=word, y=count, group=name)) +
ggtitle('Bush (2001)\n') +
geom_bar(stat='identity') +
scale_x_discrete('Word\n', limits=wc_bush$word) +
scale_y_continuous('\nCount') +
coord_flip()
p
wc_obama <- wc %>%
filter(name == 'obama') %>%
arrange(count)
p <- ggplot(data=wc_obama, aes(x=word, y=count, group=name)) +
ggtitle('Obama (2009)\n') +
geom_bar(stat='identity') +
scale_x_discrete('Word\n', limits=wc_obama$word) +
scale_y_continuous('\nCount') +
coord_flip()
p
plot_data <- flights %>%
filter(month == 2) %>%
mutate(date=as.Date(paste(year, month, day, sep='-'))) %>%
group_by(date) %>%
summarize(n=n())
head(plot_data)
date | n |
---|---|
2013-02-01 | 926 |
2013-02-02 | 682 |
2013-02-03 | 814 |
2013-02-04 | 932 |
2013-02-05 | 896 |
2013-02-06 | 901 |
p <- ggplot(data=plot_data, aes(x=date, y=n)) +
geom_point()
p
p <- p +
geom_line()
p
saturdays <- plot_data %>%
filter(strftime(date, format='%u') == 6) %>%
select(date)
p <- p +
scale_x_date(breaks=saturdays$date, date_label='%a %b %d')
p
p <- p +
scale_x_date(element_blank(), breaks=saturdays$date, date_label='%a %b %d') +
scale_y_continuous("Flights per day\n", limits=c(600, 1000), labels=comma)
p
plot_data <- flights %>%
mutate(delay=arr_delay, big_delay=ifelse(delay > 15, 1, 0)) %>%
group_by(carrier) %>%
summarize(p_big_delay=mean(big_delay, na.rm=TRUE)) %>%
join(airlines, by='carrier') %>%
mutate(short_name=gsub('(\\w+).*', '\\1', name))
head(plot_data)
carrier | p_big_delay | name | short_name |
---|---|---|---|
9E | 0.2492 | Endeavor Air Inc. | Endeavor |
AA | 0.1879 | American Airlines Inc. | American |
AS | 0.1439 | Alaska Airlines Inc. | Alaska |
B6 | 0.2611 | JetBlue Airways | JetBlue |
DL | 0.1823 | Delta Air Lines Inc. | Delta |
EV | 0.3136 | ExpressJet Airlines Inc. | ExpressJet |
p <- ggplot(data=plot_data, aes(x=p_big_delay, y=short_name)) +
geom_point() +
scale_x_continuous(element_blank(), label=percent, limits=c(0, 1)) +
scale_y_discrete(element_blank()) +
ggtitle('Percentage of flights more than 15 minutes late\n')
p
p <- p +
scale_x_continuous(element_blank(), label=percent, limits=c(.1, .4))
p
carriers_ranked <- plot_data %>%
arrange(p_big_delay)
p <- p +
scale_x_continuous(element_blank(), label=percent, limits=c(.1, .4)) +
scale_y_discrete(element_blank(), limits=carriers_ranked$short_name)
p
national <- c('JetBlue', 'Southwest', 'United', 'American', 'Delta', 'US')
plot_data <- plot_data %>%
mutate(carrier_type=ifelse(short_name %in% national, 'National', 'Regional'))
p <- ggplot(data=plot_data, aes(x=p_big_delay, y=short_name)) +
geom_point(aes(shape=carrier_type)) +
scale_x_continuous(element_blank(), label=percent, limits=c(.1, .4)) +
scale_y_discrete(element_blank(), limits=carriers_ranked$short_name) +
scale_shape_manual('Carrier type', values=c(1, 16)) +
theme(legend.position=c(1,0), legend.justification=c(1,0),
legend.background=element_blank()) +
ggtitle('Percentage of flights more than 15 minutes late\n')
p
load('sqf.Rdata')
p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
geom_point() +
scale_x_continuous('\nHit rate for black suspects',
labels=percent, limits=c(0, .3)) +
scale_y_continuous('Hit rate for white suspects\n',
labels=percent, limits=c(0, .5))
p
p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
geom_point() +
scale_x_continuous('\nHit rate for black suspects',
labels=percent, limits=c(0, .5)) +
scale_y_continuous('Hit rate for white suspects\n',
labels=percent, limits=c(0, .5))
p
p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed') +
scale_x_continuous('\nHit rate for black suspects',
labels=percent, limits=c(0, .5)) +
scale_y_continuous('Hit rate for white suspects\n',
labels=percent, limits=c(0, .5))
p
p <- ggplot(data=hitrate_by_precinct, aes(x=black, y=white)) +
geom_point(size=1) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
scale_x_continuous('\nHit rate for black suspects',
labels=percent, limits=c(0, .5)) +
scale_y_continuous('Hit rate for white suspects\n',
labels=percent, limits=c(0, .5))
p
p <- ggplot(data=hitrate_by_location, aes(x=black, y=white,
group=location.housing)) +
geom_point(aes(color=location.housing), alpha=.6) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
scale_color_discrete(element_blank(),
breaks=c('housing', 'neither', 'transit'),
labels=c('Public housing', 'Pedestrian', 'Transit')) +
scale_x_continuous('\nHit rate for black suspects',
labels=percent, limits=c(0, .8)) +
scale_y_continuous('Hit rate for white suspects\n',
labels=percent, limits=c(0, .8)) +
theme(legend.position=c(1, 0), legend.justification=c(1, 0),
legend.background=element_blank())
p
p <- ggplot(data=hitrate_by_location, aes(x=black, y=white,
group=location.housing)) +
geom_point(aes(color=location.housing), alpha=.6) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
scale_color_discrete(element_blank(),
breaks=c('housing', 'neither', 'transit'),
labels=c('Public housing', 'Pedestrian', 'Transit')) +
scale_x_continuous('\nHit rate for black suspects', labels=percent,
trans='log10', limits=c(0.003, 1),
breaks=c(.003, .01, .03, .1, .3, 1)) +
scale_y_continuous('Hit rate for white suspects\n', labels=percent,
trans='log10', limits=c(0.003, 1),
breaks=c(.003, .01, .03, .1, .3, 1)) +
theme(legend.position=c(1, 0), legend.justification=c(1, 0),
legend.background=element_blank())
p
p <- ggplot(data=hitrate_by_location, aes(x=black, y=white,
group=location.housing)) +
geom_point(aes(color=location.housing, size=count), alpha=.6) +
geom_abline(slope=1, intercept=0, linetype='dashed') +
scale_size_area(guide=FALSE) +
scale_color_discrete(element_blank(),
breaks=c('housing', 'neither', 'transit'),
labels=c('Public housing', 'Pedestrian', 'Transit')) +
scale_x_continuous('\nHit rate for black suspects', labels=percent,
trans='log10', limits=c(0.003, 1),
breaks=c(.003, .01, .03, .1, .3, 1)) +
scale_y_continuous('Hit rate for white suspects\n', labels=percent,
trans='log10', limits=c(0.003, 1),
breaks=c(.003, .01, .03, .1, .3, 1)) +
theme(legend.position=c(1, 0), legend.justification=c(1, 0),
legend.background=element_blank())
p
load('voter_intent.Rdata')
head(voter_intent)
date | p_obama |
---|---|
2012-09-22 | 0.4649 |
2012-09-23 | 0.4883 |
2012-09-24 | 0.4744 |
2012-09-25 | 0.5069 |
2012-09-26 | 0.4729 |
2012-09-27 | 0.5108 |
p <- ggplot(data=voter_intent, aes(x=date, y=p_obama)) +
geom_point() +
geom_line() +
scale_x_date(element_blank(), date_breaks='1 week', date_labels='%b. %d') +
scale_y_continuous('Percent support for Obama\n', labels=percent,
limits=c(0, 1))
p
p <- p +
scale_y_continuous('Percent support for Obama\n', labels=percent,
limits=c(.3, .55))
p
p <- p +
geom_vline(data=debates, aes(xintercept=as.numeric(dates)), linetype='dashed')
p
debates$event <- paste(c('First', 'Second', 'Third'), 'debate', sep='\n')
p <- p +
geom_text(data=debates, aes(x=dates, y=.55, label=event),
vjust=1, nudge_x=c(-3, -3, 3), size=3)
p
capitalize_all <- Vectorize(function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2), sep="", collapse=" ")
})
p <- ggplot(demographic_dist, aes(x=attribute, y=p, group=source)) +
geom_bar(aes(color=source, fill=source), stat='identity', position='dodge') +
facet_grid(.~cat, scales="free_x", space="free") +
scale_y_continuous(element_blank(), limits=c(0,1), labels=percent_format()) +
scale_x_discrete(element_blank(), labels=capitalize_all) +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
theme(legend.position='bottom', legend.title=element_blank())
p
p <- ggplot(demographic_dist, aes(attribute, p, group=source)) +
geom_point(aes(color=source)) +
geom_line(aes(color=source)) +
facet_grid(.~cat, scales="free_x", space="free") +
scale_y_continuous(element_blank(), limits=c(0,1), labels=percent_format()) +
scale_x_discrete(element_blank(), labels=capitalize_all) +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
theme(legend.position='bottom', legend.title=element_blank())
p
p <- ggplot(demographic_dist, aes(attribute, p, group=source)) +
geom_point() +
geom_line(aes(linetype=source)) +
facet_grid(.~cat, scales="free_x", space="free") +
scale_y_continuous(element_blank(), limits=c(0,1), labels=percent_format()) +
scale_x_discrete(element_blank(), labels=capitalize_all) +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
theme(legend.position='bottom', legend.title=element_blank())
p
load('third_party_ads.Rdata')
# Helper function to make labels pretty
getPrettyLdaTopicLabel <- function (topic) {
pretty.topic <- sub('pub_games', 'Gaming', topic)
pretty.topic <- sub('general', 'miscellaneous', pretty.topic)
pretty.topic <- sub('(pub|ref|web-services|offline)_', '', pretty.topic)
pretty.topic <- sub('_', ' ', pretty.topic)
pretty.topic <- sub('peoplesearch', 'people search', pretty.topic)
pretty.topic <- sub('porn', 'Pornography', pretty.topic)
pretty.topic <- sub('gov', 'Government', pretty.topic)
pretty.topic <- sub('entertainment tv', 'entertainment/TV', pretty.topic)
pretty.topic <- gsub('entertainment (\\w)', 'entertainment/\\U\\1',
pretty.topic, perl=TRUE)
pretty.topic <- gsub('gambling (\\w)', 'gambling/\\U\\1', pretty.topic,
perl=TRUE)
pretty.topic <- gsub('^(\\w)', '\\U\\1', pretty.topic, perl=TRUE)
pretty.topic
}
p <- ggplot(third_party_ads, aes(y=topic, x=pct_susceptible)) +
facet_grid(high_level_topic ~ ., space='free', scales='free', drop=TRUE) +
geom_point() +
scale_y_discrete(element_blank(), labels=getPrettyLdaTopicLabel) +
scale_x_continuous('\nProportion of traffic supported by third-party ads',
label=percent)
p
p <- ggplot(third_party_ads,
aes(y=topic, x=pct_susceptible, size=topic_pageviews)) +
facet_grid(high_level_topic ~ ., space='free', scales='free', drop=TRUE) +
geom_point() +
scale_y_discrete(element_blank(), labels=getPrettyLdaTopicLabel) +
scale_x_continuous('\nProportion of traffic supported by third-party ads',
label=percent) +
scale_size_area(guide=FALSE, breaks=c(1e8, 2.5e8, 5e8, 1e9, 2e9),
labels=c('100m', '250m', '500m', '1bn', '2bn'))
p
# Additional value to plot
# Usually this would be computed from some data; it is presented here as a
# hard-coded value for illustrative purposes
non_commerce_susceptibility_overall_pct <- 0.32
p <- ggplot(third_party_ads,
aes(y=topic, x=pct_susceptible, size=topic_pageviews)) +
facet_grid(high_level_topic ~ ., space='free', scales='free', drop=TRUE) +
geom_point() +
scale_y_discrete(element_blank(), labels=getPrettyLdaTopicLabel) +
scale_x_continuous('\nProportion of traffic supported by third-party ads',
label=percent) +
scale_size_area(guide=FALSE, breaks=c(1e8, 2.5e8, 5e8, 1e9, 2e9),
labels=c('100m', '250m', '500m', '1bn', '2bn')) +
geom_vline(aes(xintercept=pct_susceptible_high), linetype='dashed') +
geom_vline(aes(xintercept=non_commerce_susceptibility_overall_pct),
linetype='solid')
p
load('twitter_vis.Rdata')
# function to append units (k, M, B) to labels
addUnits <- function(n) {
labels <- ifelse(n < 1000, n, # less than thousands
ifelse(n < 1e6, paste0(round(n/1e3), 'k'), # in thousands
ifelse(n < 1e9, paste0(round(n/1e6), 'M'), # in millions
ifelse(n < 1e12, paste0(round(n/1e9), 'B'), # in billions
'too big!'
))))
return(labels)
}
# function to capitalize first letter of a string
capitalize <- function(s) {
paste(toupper(substring(s, 1, 1)), substring(s, 2), sep="")
}
size.dist.trunc <- size.dist %>%
filter(size <= 1000)
p <- ggplot(data=size.dist.trunc, aes(x=size)) +
geom_histogram(aes(weight=count), binwidth = 30) +
scale_x_continuous('\nCascade Size') +
scale_y_continuous('Number of cascades\n', label=addUnits) +
coord_cartesian(xlim=c(0, 1000))
p
p <- ggplot(data=size.dist.trunc, aes(x=size)) +
geom_histogram(aes(weight=count), binwidth = 30, position='dodge') +
scale_x_continuous('\nCascade Size') +
scale_y_log10('Number of cascades\n', label=addUnits) +
coord_cartesian(xlim=c(0, 1000))
p
Using specifying axis limits
when using a stat_
geom (e.g., geom_histogram
, geom_bar
) could result in unexpected data loss. Always use coord_cartesian
to adjust axis limits in such cases.
p <- ggplot(data=size.dist.trunc, aes(x=size)) +
geom_histogram(aes(weight=count), binwidth = 30, position='dodge') +
scale_x_continuous('\nCascade Size', limits=c(0, 1000)) +
scale_y_log10('Number of cascades\n', label=addUnits)
p
p <- ggplot(data=size.dist, aes(x=size, y=1-cdf)) +
geom_line() +
scale_x_continuous(labels=comma, limits=c(1,1e4)) +
scale_y_log10(breaks=10^seq(-7,0), labels=sapply(10^seq(-7,0), percent),
limits=c(1e-7, .1)) +
labs(x='\nCascade Size', y='CCDF')
p
p <- ggplot(data=size.dist, aes(x=size, y=1-cdf)) +
geom_line() +
scale_x_log10(labels=comma, breaks=10^(0:log10(max(size.dist$size))),
limits=c(1,1e4)) +
scale_y_log10(breaks=10^seq(-7,0), labels=sapply(10^seq(-7,0), percent),
limits=c(1e-7, .1)) +
labs(x='\nCascade Size', y='CCDF')
p
p <- ggplot(data=dist.by.category, aes(x=size, y=ccdf, group=category)) +
geom_line(aes(color=category)) +
scale_x_log10(labels=comma, breaks=10^(0:log10(max(stats$size)))) +
scale_y_log10(breaks=10^seq(-5,0), labels=sapply(10^seq(-5,0), percent)) +
labs(x='\nCascade Size', y='CCDF') +
theme(legend.position=c(0, 0), legend.justification=c(0, 0),
legend.title=element_blank(), legend.background=element_blank()) +
scale_color_discrete(labels=capitalize, breaks=category.breaks)
p