根据福布斯最近发布的全球明星收入top100的数据,我们要分析几个问题:1)收入和年龄的关系;2)各国总收入和人次比例;3)男女比例;4)各行业summary数据
载入包
# Packages used by the analysis (list markers from the web page removed so
# the calls run as plain R).
library(reshape2)
library(plyr)
library(ggplot2)
读入数据
# Read the earnings data. Strings are kept as character vectors rather than
# being converted to factors.
# NOTE(review): the path is the author's local absolute path; adjust it to
# wherever the CSV lives on your machine.
star <- read.csv(
  "H:/自媒体/2015-07-04/明星收入分析.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
如果不是特别需要,读取数据时最好将stringsAsFactors参数设置为FALSE。因为一旦默认将字符转化为因子,你可能会碰到各种报错;当你碰到与factor有关的错误时,就要考虑一下自己是不是把因子当成字符了。
绘制收入和年龄的关系图
# Theme settings for the age-vs-earnings scatter plot. They are wrapped in a
# list so the whole bundle can be added to a plot with `+`.
theme_opts <- list(theme(
  # Major grid lines in both directions: mid grey, width 1.
  panel.grid.major.y = element_line(
    colour = rgb(red = 146, green = 146, blue = 146, maxColorValue = 255),
    size = 1
  ),
  panel.grid.major.x = element_line(
    colour = rgb(red = 146, green = 146, blue = 146, maxColorValue = 255),
    size = 1
  ),
  # Background of the whole figure.
  plot.background = element_rect(
    fill = rgb(red = 242, green = 242, blue = 242, maxColorValue = 255)
  ),
  panel.border = element_blank(),
  # Background of the plotting area.
  panel.background = element_rect(
    fill = rgb(red = 242, green = 242, blue = 242, maxColorValue = 255)
  ),
  axis.text.x = element_text(colour = "grey20", size = 12),
  axis.text.y = element_text(colour = "grey20", size = 12),
  axis.ticks.x = element_blank(),
  # Thickness of the y-axis tick marks.
  axis.ticks.y = element_line(size = 1),
  # Disabled options: hide the y-axis / x-axis tick labels.
  # axis.text.y = element_blank(),
  # axis.text.x = element_blank(),
  axis.title.y = element_blank(),
  axis.title.x = element_blank()
))
# Manual colour palette (teal, pink, brown) used for the colour/fill scales.
cols <- c(
  rgb(red = 0, green = 130, blue = 137, maxColorValue = 255),
  rgb(red = 252, green = 102, blue = 129, maxColorValue = 255),
  rgb(red = 120, green = 81, blue = 76, maxColorValue = 255)
)
# Scatter plot of Earnings against Age, coloured by sex.
# Rows with a missing Age cannot be placed on the x axis, so they are dropped
# first and the filtered data (not the full `star` table) is what gets plotted.
temp <- star[!is.na(star$Age), ]
p <- ggplot(temp, aes(x = Age, y = Earnings))
# Size and colour of the points.
p <- p +
  geom_point(aes(colour = sex), size = 8, alpha = 0.8) +
  scale_colour_manual(values = cols)
# Restrict the visible x range without discarding data, with a tick every 5.
p <- p + coord_cartesian(xlim = c(20, 75))
p <- p + scale_x_continuous(breaks = seq(20, 75, by = 5))
# Disabled: point labels; the jitter option nudges labels randomly so that
# they do not overlap.
# p <- p + geom_text(aes(label = state_abbrev), alpha = 0.3, jitter = TRUE)
# Horizontal reference line at Earnings = 50.
p <- p + geom_hline(
  yintercept = 50,
  size = 1.5,
  color = rgb(red = 253, green = 107, blue = 117, maxColorValue = 255),
  alpha = 0.8
)
# Disabled: loess trend line.
# p <- p + geom_smooth(
#   method = "loess", span = 1.05, se = FALSE,
#   color = rgb(red = 0, green = 137, blue = 130, maxColorValue = 255),
#   size = 1
# )
# Hide the colour legend ("none" replaces the deprecated FALSE).
p <- p + guides(colour = "none")
p <- p + theme_opts
从年龄上分析,收入前一百名的明星多集中在20-50岁之间,尤其是女性群体。演艺和竞技圈确实不属于高龄人群统治的领域,毫无疑问这是一个靠脸(体力)吃饭的领域;而我国中医行业恰恰相反,中医必须老龄化才吃得开,真正是一个疗效只看皱纹的行当,当然前提是如果有疗效的话。
绘制性别饼图
# Total earnings per sex; the result has one row per value of `sex`.
temp <- aggregate(Earnings ~ sex, data = star, FUN = sum)
#' Format proportions as percentage strings
#'
#' @param x Numeric vector of proportions (e.g. 0.7529 for 75.29%).
#' @param digits Number of digits passed to `formatC()`; with the default
#'   `format = "f"` this is the number of decimal places.
#' @param format Format code passed to `formatC()`.
#' @param ... Further arguments forwarded to `formatC()`.
#' @return A character vector the same length as `x`, each element ending
#'   in "%".
percent <- function(x, digits = 2, format = "f", ...) {
  paste0(formatC(100 * x, format = format, digits = digits, ...), "%")
}
# Theme settings for the pie chart, wrapped in a list so the bundle can be
# added to a plot with `+`.
# `axis.text.y` is given once only: passing it twice to theme() fails with
# "formal argument matched by multiple actual arguments". The blank setting
# is the one kept, so the y-axis labels are hidden.
theme_opts <- list(theme(
  panel.grid = element_blank(),
  # Background of the whole figure.
  plot.background = element_rect(
    fill = rgb(red = 242, green = 242, blue = 242, maxColorValue = 255)
  ),
  panel.border = element_blank(),
  # Background of the plotting area.
  panel.background = element_rect(
    fill = rgb(red = 242, green = 242, blue = 242, maxColorValue = 255)
  ),
  axis.text.x = element_text(colour = "grey20", size = 12),
  axis.ticks = element_blank(),
  # Hide the y-axis labels.
  axis.text.y = element_blank(),
  # axis.text.x = element_blank(),
  axis.title.y = element_blank(),
  axis.title.x = element_blank()
))
# Pie chart of total earnings by sex.
# Total earnings per sex; one row per value of `sex`.
temp <- aggregate(Earnings ~ sex, data = star, FUN = sum)
# Manual fill palette (teal, pink, brown).
cols <- c(
  rgb(red = 0, green = 130, blue = 137, maxColorValue = 255),
  rgb(red = 252, green = 102, blue = 129, maxColorValue = 255),
  rgb(red = 120, green = 81, blue = 76, maxColorValue = 255)
)
# A single stacked bar (stat = "identity" uses Earnings as the bar height)
# which coord_polar() on "y" then bends into a pie.
p <- ggplot(temp, aes(x = "", y = Earnings, fill = sex)) +
  geom_bar(width = 1, stat = "identity")
p <- p + coord_polar("y", start = 0)
p <- p + scale_fill_manual(values = cols)
# Add the percentage labels. position_stack(vjust = 0.5) centres each label
# inside its own slice using the same stacking as geom_bar(), so it does not
# depend on the order in which ggplot2 stacks the groups (hand-computed
# cumulative offsets end up in the wrong slice when that order differs).
# Requires ggplot2 >= 2.2.0.
p <- p + geom_text(
  aes(label = percent(Earnings / sum(Earnings), digits = 2)),
  position = position_stack(vjust = 0.5),
  size = 25,
  color = "white"
)
# Hide the fill legend ("none" replaces the deprecated FALSE).
p <- p + guides(fill = "none")
p <- p + theme_opts
p <- p + theme(axis.text.x = element_blank())
这里需要注意一点:ggplot默认情况下(stat = "bin")bar的高度指的是观测值个数,所以要用具体的数值作为高度时需要设置stat = "identity"。pie图不是很满意,有空再修改。 收入前一百名中,男性收入占总收入的75.29%。尽管近年来女性地位提升很高,也许在精神世界的地位更高,但好像每一个行业实际上仍然是男性主导着一切。
相关数据和代码:http://pan.baidu.com/s/1g6kQa 密码:微信索取
关于我们,关注理性与文艺,用数据创作内容性的精致阅读,这里是数据分析挖掘人员与文艺青年的集结地,不做培训,不做鼓吹,只踏踏实实的做一个又一个数据驱动的文章,并设计机器人减轻数据分析的负担,无论你感兴趣还是想参与都可以关注,请加微信公众号大音如霜


雷达卡






京公网安备 11010802022788号







