Chapter 5 Results
This is the overall data preprocessing
<-read.csv(file = 'ca-educational-attainment-personal-income-2008-2014.csv', header = TRUE)
data<-data %>% filter(Age == '18 to 64')
data_p$Educational.Attainment<-factor(data_p$Educational.Attainment,
data_plevels = c("Bachelor's degree or higher","Some college, less than 4-yr degree",
"High school or equivalent","No high school diploma"))
$Personal.Income<-factor(data_p$Personal.Income,
data_plevels = c('$75,000 and over','$50,000 to $74,999','$35,000 to $49,999','$25,000 to $34,999',
'$15,000 to $24,999','$10,000 to $14,999','$5,000 to $9,999','No Income'))
$Year<-fct_recode(data_p$Year,
data_p"2008"='01/01/2008 12:00:00 AM',"2009"='01/01/2009 12:00:00 AM',
"2010"='01/01/2010 12:00:00 AM',"2011"='01/01/2011 12:00:00 AM',
"2012"='01/01/2012 12:00:00 AM',"2013"='01/01/2013 12:00:00 AM',
"2014"='01/01/2014 12:00:00 AM')
$Educational.Attainment<- fct_recode(data_p$Educational.Attainment,
data_p">=Bachelor" = "Bachelor's degree or higher","College (<4year)" = "Some college, less than 4-yr degree",
"High school or equivalent" = "High school or equivalent","< high school" = "No high school diploma")
$Personal.Income<- fct_recode(data_p$Personal.Income,
data_p"[75k,inf)"='$75,000 and over',"[50k,75k)"='$50,000 to $74,999',"[35k,50k)"='$35,000 to $49,999',
"[25k,35k)"='$25,000 to $34,999',"[15k,25k)"='$15,000 to $24,999',"[10k,15k)"='$10,000 to $14,999',
"[5k,10k)"='$5,000 to $9,999','No Income'='No Income')
All candidate plots are as follows.
5.1 Static
5.1.1 Alluvial
#make it a function in convenience to plot by year
<-function(year=0){
plot_alluvialif (!year==0){
<- data_p %>% filter(Year == year) %>%
data_alluvial group_by(Gender, Educational.Attainment, Personal.Income) %>%
summarise(Freq = sum(Population.Count)) %>% ungroup()
else{
}<- data_p %>% group_by(Gender, Educational.Attainment, Personal.Income) %>%
data_alluvial summarise(Freq = sum(Population.Count)) %>% ungroup()
}
<-ggplot(data = data_alluvial, aes(y = Freq, axis1 = Gender, axis2 = Educational.Attainment,
paxis3 = Personal.Income))+
geom_alluvium(aes(fill = Gender), width = 1/6) +
geom_stratum(width = 1/12, fill = "grey80", color = "grey") +
geom_label(stat = "stratum",
aes(label = after_stat(stratum))) +
scale_x_discrete(expand = c(.05, .05)) +
scale_fill_brewer(type = "qual", palette = "Set1") +
ggtitle("") +
theme_void()
return (p)
}
plot_alluvial()
#If we omit educational attempt, relation between gender and income are clearer.
#Or maybe stacked bar?
<- data_p %>% group_by(Gender, Personal.Income) %>%
data_alluvial_2 summarise(Freq = sum(Population.Count)) %>% ungroup()
ggplot(data = data_alluvial_2, aes(y = Freq, axis1 = Gender, axis2 = fct_rev(fct_relevel(Personal.Income,'No Income'))))+
geom_alluvium(aes(fill = Gender), width = 1/6) +
geom_stratum(width = 1/12, fill = "grey80", color = "grey") +
geom_label(stat = "stratum",
aes(label = after_stat(stratum))) +
scale_x_discrete(expand = c(.05, .05)) +
scale_fill_brewer(type = "qual", palette = "Set1") +
ggtitle("") +
theme_void()
#Over years
<- data_p %>% group_by(Gender, Educational.Attainment, Personal.Income) %>%
data_alluvial_year summarise(Freq = sum(Population.Count), Year = Year) %>% ungroup()
#If we use facet, too small for observation
ggplot(data = data_alluvial_year, aes(y = Freq, axis1 = Gender, axis2 = Educational.Attainment,axis3 = Personal.Income))+
geom_alluvium(aes(fill = Gender), width = 1/6) +
geom_stratum(width = 1/12, fill = "grey80", color = "grey") +
geom_label(stat = "stratum",
aes(label = after_stat(stratum))) +
scale_x_discrete(expand = c(.05, .05)) +
scale_fill_brewer(type = "qual", palette = "Set1") +
facet_wrap(vars(Year))+
ggtitle("") +
theme_void()
#Do it separately for each year
#We make the alluvial plot a function: plot_allvial as shown above
<-plot_alluvial(2008)
p1<-plot_alluvial(2009)
p2<-plot_alluvial(2010)
p3<-plot_alluvial(2012)
p5<-plot_alluvial(2013)
p6<-plot_alluvial(2014) p7
###stacked bar
#filter on data_p when making function
#make it a function
<-function(year = 0){
plot_stackedbarif(year == 0){
<- data_p %>%
data_stack group_by(Educational.Attainment, Personal.Income) %>%
summarise(Freq = sum(Population.Count)) %>% ungroup()
else{
}<- data_p %>% filter(Year == year) %>% group_by(Educational.Attainment, Personal.Income) %>%
data_stack summarise(Freq = sum(Population.Count)) %>% ungroup()
}<- rev(brewer.pal(8,'Blues'))
colors mosaic(Personal.Income ~ Educational.Attainment ,direction = c('v','h'),
highlighting_fill = colors,
data_stack, labeling = labeling_border(tl_labels = c(TRUE,TRUE), rot_labels = c(5,0,0,60)))
}
plot_stackedbar()
Do we need draw supposed bar for comparison?
#Over years
plot_stackedbar(2008)
plot_stackedbar(2009)
plot_stackedbar(2010)
plot_stackedbar(2011)
plot_stackedbar(2012)
plot_stackedbar(2013)
plot_stackedbar(2014)
###faceted stacked bar
library(ggmosaic)
<- data_p %>% group_by(Gender, Educational.Attainment, Personal.Income) %>%
data_facet summarise(Freq = sum(Population.Count), Year =Year) %>% ungroup()
<- rev(brewer.pal(8,'Blues'))
colors #mosaic(Personal.Income ~ Gender ,direction = c('v','h'),
#data_facet, highlighting_fill = colors,
#labeling = labeling_border(tl_labels = c(TRUE,TRUE), rot_labels = c(5,0,0,60)))
#Notice that once library ggmosaic is loaded, mosaic can no longer be used
#In ggmosaic, y is in reverse order#in ggmosaic, y is in reverse order. We change the level order before we make a plot
$Personal.Income<- fct_rev(data_facet$Personal.Income)
data_facetggplot(data_facet)+
geom_mosaic(aes(weight = Freq, x = product(Gender), fill = Personal.Income))+
scale_fill_manual("legend", values = c("[75k,inf)" = colors[1], "[50k,75k)" = colors[2],
"[35k,50k)" = colors[3], "[25k,35k)" = colors[4],
"[15k,25k)" = colors[5], "[10k,15k)" = colors[6],
"[5k,10k)" = colors[7], "No Income" = colors[8]))+
facet_wrap(vars(Educational.Attainment))
#Is the gap between rich and poor affected by gender?
ggplot(data_facet)+
geom_mosaic(aes(weight = Freq, x = product(Educational.Attainment), fill = Personal.Income))+
scale_fill_manual("legend", values = c("[75k,inf)" = colors[1], "[50k,75k)" = colors[2],
"[35k,50k)" = colors[3], "[25k,35k)" = colors[4],
"[15k,25k)" = colors[5], "[10k,15k)" = colors[6],
"[5k,10k)" = colors[7], "No Income" = colors[8]))+
facet_wrap(vars(Gender))
#Or maybe we don't need include education factor? In the stackedbar.R
#What can we say?
#Over years
ggplot(data_facet)+
geom_mosaic(aes(weight = Freq, x = product(Educational.Attainment), fill = Personal.Income))+
scale_fill_manual("legend", values = c("[75k,inf)" = colors[1], "[50k,75k)" = colors[2],
"[35k,50k)" = colors[3], "[25k,35k)" = colors[4],
"[15k,25k)" = colors[5], "[10k,15k)" = colors[6],
"[5k,10k)" = colors[7], "No Income" = colors[8]))+
facet_grid(rows = vars(Gender), cols = vars(Year))