今天我们来分析美国总统的国情咨文:看看总统获得的掌声次数与支持率之间的关系,并简单实现国情咨文的词云。
#载入包和设置环境目录
# Attach the packages used by this script. The leading list markers ("- ")
# from the article formatting are dropped so each line is a plain R call.
library(ggplot2)
library(tm)
library(grid)
library(dplyr)
library(wordcloud)
# Work inside the folder holding the speech transcripts; relative paths used
# later in the script (such as the PNG written by ggsave) resolve against it.
setwd("H:/自媒体/2015-07-07/掌声不代表支持/SOTU")
#整理数据
# Read the transcripts into a tm corpus. Only *.txt files are picked up, so
# other files in the folder (for example the PNG this script saves into the
# working directory) are not parsed as speeches on a second run.
corpus <- Corpus(DirSource("H:/自媒体/2015-07-07/掌声不代表支持/SOTU",
                           pattern = "\\.txt$"))

# Lower-case first: removeWords() matches case-sensitively, so a capitalised
# "The" or "And" would otherwise survive the stop-word filter.
txt <- tm_map(corpus, content_transformer(tolower))

# Remove stop words while apostrophes are still present, so contractions in
# the stop-word list (e.g. "that's") still match the text.
stop_words <- c(stopwords("english"), "and", "that", "the")
txt <- tm_map(txt, removeWords, stop_words)

txt <- tm_map(txt, removePunctuation)  # strip punctuation and other symbols
txt <- tm_map(txt, stripWhitespace)    # collapse the leftover runs of blanks
#生成词频矩阵
# Term-document matrix: one row per term, one column per transcript.
# The control option is spelled `wordLengths`; the misspelled `wordLength`
# is not a recognised option, so the intended c(1, Inf) range never applied.
tdm <- TermDocumentMatrix(txt, control = list(wordLengths = c(1, Inf)))
m <- as.matrix(tdm)

# Build the per-year table used by the line chart: applause count, year and
# approval rating. The applause row is selected by name instead of assuming
# it is the most frequent term in the corpus.
# NOTE(review): columns of `m` are assumed to be in 2009..2015 order (file
# names sorted as SOTU2009.txt ... SOTU2015.txt) - confirm against the folder.
speech_ids <- paste0("SOTU", 2009:2015)
if (ncol(m) != length(speech_ids)) {
  stop("Expected ", length(speech_ids), " transcripts but found ", ncol(m),
       call. = FALSE)
}
if (!"applause" %in% rownames(m)) {
  stop("Term \"applause\" not found in the term-document matrix",
       call. = FALSE)
}
sotu <- data.frame(
  applause = as.numeric(m["applause", ]),
  year = 2009:2015,
  approval = c(64, 48, 50, 45, 51, 42, 46),
  row.names = speech_ids
)
#绘制词云
# Word cloud of the 2015 address.
# Local names avoid shadowing base functions (`colors`, `remove`), logical
# flags are spelled TRUE/FALSE, and rgb() uses the full `maxColorValue` name.
cloud_palette <- brewer.pal(8, "Dark2")
tt <- data.frame(m)
# Terms left out of the cloud: the stage direction and a few filler words.
drop_terms <- c("applause", "and", "thats", "will")
tt <- tt[!rownames(tt) %in% drop_terms, ]
# Light grey canvas, the same background shade as the line chart.
par(bg = rgb(red = 242, green = 242, blue = 242, maxColorValue = 255))
wordcloud(
  rownames(tt),
  tt[, "SOTU2015.txt"],
  scale = c(5, 0.3),
  min.freq = -Inf,     # no lower frequency cut-off; max.words caps the count
  max.words = 100,
  colors = cloud_palette,
  random.order = FALSE,
  random.color = TRUE,
  rot.per = 0.5,       # proportion of words drawn rotated
  font = 2,
  family = "serif"
)
生成词云时,如果你使用 RStudio,请将绘图窗口调到最大,否则词云会显示不完全。
#解除tm包
# Detach tm before the ggplot2 chart is built; per the accompanying text this
# is meant to stop annotate() from resolving to a non-ggplot2 function.
# The guard keeps a re-run from failing when tm is already detached.
# NOTE(review): annotate() appears to come from NLP, which tm attaches as a
# dependency - confirm that detaching tm alone restores ggplot2's annotate().
if ("package:tm" %in% search()) {
  detach("package:tm", unload = TRUE)
}
library(ggplot2)
这里之所以多了一步,是因为 tm 包和 ggplot2 包都有一个 annotate 函数:如果不解除 tm,ggplot2 的 annotate 会被 tm 里的同名函数覆盖,导致下面作图时报错。
#绘制折线图
# Theme overrides layered on top of theme_bw() by the line chart.
# Colours are 0-255 RGB triples; rgb() is called with the full
# `maxColorValue` argument name, and local() keeps the helper colours out of
# the global workspace.
theme_opts <- local({
  canvas_col <- rgb(242, 242, 242, maxColorValue = 255)  # light grey
  grid_col <- rgb(146, 146, 146, maxColorValue = 255)    # mid grey
  ink_col <- rgb(74, 69, 42, maxColorValue = 255)        # dark brown

  theme(
    panel.background = element_rect(fill = canvas_col),
    plot.background = element_rect(fill = canvas_col),
    panel.grid.major = element_line(colour = grid_col, size = .75),
    # Border drawn in the background colour, which hides it.
    panel.border = element_rect(colour = canvas_col),
    axis.ticks = element_blank(),
    axis.text.x = element_text(colour = "grey20", size = 12),
    # NOTE(review): the original supplied axis.text.y twice. The entry that
    # matches axis.text.x is kept; the dropped duplicate was
    # element_text(size = 13, colour = rgb(74, 69, 42), face = "bold") -
    # confirm which styling was intended.
    axis.text.y = element_text(colour = "grey20", size = 12),
    axis.title.y = element_text(size = 11, colour = ink_col, face = "bold",
                                vjust = 1.5),
    axis.title.x = element_text(size = 11, colour = ink_col, face = "bold",
                                vjust = -.5),
    legend.position = "none"
  )
})
# Line chart of applause count and approval rating by year.
# The line colours are constants, so they are set outside aes(); inside
# aes() the hex strings were treated as category labels and the lines were
# drawn with the default discrete palette instead. Each line now uses exactly
# the colour of its text label.
applause_col <- "#FD6467"
approval_col <- "#00bdc4"

p <- ggplot(sotu, aes(x = year)) +
  geom_line(aes(y = applause), colour = applause_col, size = 1.6) +
  geom_line(aes(y = approval), colour = approval_col, size = 1.6) +
  # Namespace-qualified so the call cannot resolve to another package's
  # annotate(), whatever the search-path order is.
  ggplot2::annotate("text", x = 2010, y = 100, colour = applause_col,
                    label = "掌声", size = 7, fontface = "bold") +
  ggplot2::annotate("text", x = 2010, y = 40, colour = approval_col,
                    label = "支持率", size = 7, fontface = "bold") +
  theme_bw() +
  # One major break per year and no minor vertical grid lines.
  scale_x_continuous(minor_breaks = NULL, breaks = 2009:2015) +
  ggtitle("") +
  ylab("") +
  xlab("") +
  # Heavy baseline at y = 0, used as the chart's visual signature.
  geom_hline(yintercept = 0, size = 1.2,
             colour = rgb(74, 69, 42, maxColorValue = 255)) +
  theme_opts

# Save the chart explicitly rather than relying on the last plot created.
ggsave(filename = "掌声和支持率.png", plot = p,
       width = 10, height = 8, scale = 0.8)
相关数据和代码:http://pan.baidu.com/s/1hqxVg7y 密码:微信索取
关于我们:关注理性与文艺,用数据创作有内容的精致阅读。这里是数据分析挖掘人员与文艺青年的集结地,不做培训,不做鼓吹,只踏踏实实地写一篇又一篇数据驱动的文章,并设计机器人减轻数据分析的负担。无论你是感兴趣还是想参与,都可以关注,请加微信公众号"大音如霜"。


雷达卡






京公网安备 11010802022788号







