- 阅读权限
- 255
- 威望
- 0 级
- 论坛币
- 49955 个
- 通用积分
- 79.3687
- 学术水平
- 253 点
- 热心指数
- 300 点
- 信用等级
- 208 点
- 经验
- 41518 点
- 帖子
- 3256
- 精华
- 14
- 在线时间
- 766 小时
- 注册时间
- 2006-5-4
- 最后登录
- 2022-11-6
|
- ## ----word_count----------------------------------------------------------
- freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
- head(freq, 14)
- wf <- data.frame(word=names(freq), freq=freq)
- head(wf)
- ## ----plot_freq, fig.width=12, out.width="\\textwidth"--------------------
- library(ggplot2)
- subset(wf, freq>500) %>%
- ggplot(aes(word, freq)) +
- geom_bar(stat="identity") +
- theme(axis.text.x=element_text(angle=45, hjust=1))
- ## ----wordcloud, echo=FALSE, warning=FALSE, message=FALSE, out.width="0.75\\textwidth", crop=TRUE----
- library(wordcloud)
- set.seed(123)
- wordcloud(names(freq), freq, min.freq=40)
- ## ----wordcloud, eval=FALSE-----------------------------------------------
- ## library(wordcloud)
- ## set.seed(123)
- ## wordcloud(names(freq), freq, min.freq=40)
- ## ----wordcloud_max_words, echo=FALSE, out.width="0.75\\textwidth", crop=TRUE----
- set.seed(142)
- wordcloud(names(freq), freq, max.words=100)
- ## ----wordcloud_max_words, eval=FALSE-------------------------------------
- ## set.seed(142)
- ## wordcloud(names(freq), freq, max.words=100)
- ## ----wordcloud_higher_freq, echo=FALSE, out.width="0.75\\textwidth", crop=TRUE----
- set.seed(142)
- wordcloud(names(freq), freq, min.freq=100)
- ## ----wordcloud_higher_freq, eval=FALSE-----------------------------------
- ## set.seed(142)
- ## wordcloud(names(freq), freq, min.freq=100)
- ## ----wordcloud_colour, echo=FALSE, out.width="0.75\\textwidth", crop=TRUE----
- set.seed(142)
- wordcloud(names(freq), freq, min.freq=100, colors=brewer.pal(6, "Dark2"))
- ## ----wordcloud_colour, eval=FALSE----------------------------------------
- ## set.seed(142)
- ## wordcloud(names(freq), freq, min.freq=100, colors=brewer.pal(6, "Dark2"))
- ## ----wordcloud_scale, echo=FALSE, warning=FALSE, out.width="0.75\\textwidth", crop=TRUE----
- set.seed(142)
- wordcloud(names(freq), freq, min.freq=100, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
- ## ----wordcloud_scale, eval=FALSE-----------------------------------------
- ## set.seed(142)
- ## wordcloud(names(freq), freq, min.freq=100, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
- ## ----wordcloud_rotate, echo=FALSE, warning=FALSE, out.width="0.75\\textwidth", crop=TRUE----
- set.seed(142)
- dark2 <- brewer.pal(6, "Dark2")
- wordcloud(names(freq), freq, min.freq=100, rot.per=0.2, colors=dark2)
- ## ----wordcloud_rotate, eval=FALSE----------------------------------------
- ## set.seed(142)
- ## dark2 <- brewer.pal(6, "Dark2")
- ## wordcloud(names(freq), freq, min.freq=100, rot.per=0.2, colors=dark2)
- ## ----library_qdap, echo=FALSE, messages=FALSE----------------------------
- library(qdap)
- ## ----qdap_create_word_list-----------------------------------------------
- words <- dtm %>%
- as.matrix %>%
- colnames %>%
- (function(x) x[nchar(x) < 20])
- ## ----qdap_word_length, out.lines=11--------------------------------------
- length(words)
- head(words, 15)
- summary(nchar(words))
- table(nchar(words))
- dist_tab(nchar(words))
- ## ----qdap_word_length_plot, echo=FALSE, fig.height=4---------------------
- data.frame(nletters=nchar(words)) %>%
- ggplot(aes(x=nletters)) +
- geom_histogram(binwidth=1) +
- geom_vline(xintercept=mean(nchar(words)),
- colour="green", size=1, alpha=.5) +
- labs(x="Number of Letters", y="Number of Words")
- ## ----qdap_word_length_plot, eval=FALSE-----------------------------------
- ## data.frame(nletters=nchar(words)) %>%
- ## ggplot(aes(x=nletters)) +
- ## geom_histogram(binwidth=1) +
- ## geom_vline(xintercept=mean(nchar(words)),
- ## colour="green", size=1, alpha=.5) +
- ## labs(x="Number of Letters", y="Number of Words")
- ## ----qdap_letter_freq_plot, echo=FALSE, fig.height=4---------------------
- library(dplyr)
- library(stringr)
- words %>%
- str_split("") %>%
- sapply(function(x) x[-1]) %>%
- unlist %>%
- dist_tab %>%
- mutate(Letter=factor(toupper(interval),
- levels=toupper(interval[order(freq)]))) %>%
- ggplot(aes(Letter, weight=percent)) +
- geom_bar() +
- coord_flip() +
- ylab("Proportion") +
- scale_y_continuous(breaks=seq(0, 12, 2),
- label=function(x) paste0(x, "%"),
- expand=c(0,0), limits=c(0,12))
- ## ----qdap_letter_freq_plot, eval=FALSE-----------------------------------
- ## library(dplyr)
- ## library(stringr)
- ##
- ## words %>%
- ## str_split("") %>%
- ## sapply(function(x) x[-1]) %>%
- ## unlist %>%
- ## dist_tab %>%
- ## mutate(Letter=factor(toupper(interval),
- ## levels=toupper(interval[order(freq)]))) %>%
- ## ggplot(aes(Letter, weight=percent)) +
- ## geom_bar() +
- ## coord_flip() +
- ## ylab("Proportion") +
- ## scale_y_continuous(breaks=seq(0, 12, 2),
- ## label=function(x) paste0(x, "%"),
- ## expand=c(0,0), limits=c(0,12))
- ## ----qdap_count_position_plot, echo=FALSE, fig.height=7, fig.width=9-----
- words %>%
- lapply(function(x) sapply(letters, gregexpr, x, fixed=TRUE)) %>%
- unlist %>%
- (function(x) x[x!=-1]) %>%
- (function(x) setNames(x, gsub("\\d", "", names(x)))) %>%
- (function(x) apply(table(data.frame(letter=toupper(names(x)),
- position=unname(x))),
- 1, function(y) y/length(x))) %>%
- qheat(high="green", low="yellow", by.column=NULL,
- values=TRUE, digits=3, plot=FALSE) +
- ylab("Letter") +
- xlab("Position") +
- theme(axis.text.x=element_text(angle=0)) +
- guides(fill=guide_legend(title="Proportion"))
- ## ----qdap_count_position_plot, eval=FALSE--------------------------------
- ## words %>%
- ## lapply(function(x) sapply(letters, gregexpr, x, fixed=TRUE)) %>%
- ## unlist %>%
- ## (function(x) x[x!=-1]) %>%
- ## (function(x) setNames(x, gsub("\\d", "", names(x)))) %>%
- ## (function(x) apply(table(data.frame(letter=toupper(names(x)),
- ## position=unname(x))),
- ## 1, function(y) y/length(x))) %>%
- ## qheat(high="green", low="yellow", by.column=NULL,
- ## values=TRUE, digits=3, plot=FALSE) +
- ## ylab("Letter") +
- ## xlab("Position") +
- ## theme(axis.text.x=element_text(angle=0)) +
- ## guides(fill=guide_legend(title="Proportion"))
- ## ----eval=FALSE----------------------------------------------------------
- ## devtools::install_github("lmullen/gender-data-pkg")
- ## ------------------------------------------------------------------------
- name2sex(qcv(graham, frank, leslie, james, jacqui, jack, kerry, kerrie))
- ## ----review_prepare_corpus, eval=FALSE-----------------------------------
- ## # Required packages
- ##
- ## library(tm)
- ## library(wordcloud)
- ##
- ## # Locate and load the Corpus.
- ##
- ## cname <- file.path(".", "corpus", "txt")
- ## docs <- Corpus(DirSource(cname))
- ##
- ## docs
- ## summary(docs)
- ## inspect(docs[1])
- ##
- ## # Transforms
- ##
- ## toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
- ## docs <- tm_map(docs, toSpace, "/|@|\\|")
- ##
- ## docs <- tm_map(docs, content_transformer(tolower))
- ## docs <- tm_map(docs, removeNumbers)
- ## docs <- tm_map(docs, removePunctuation)
- ## docs <- tm_map(docs, removeWords, stopwords("english"))
- ## docs <- tm_map(docs, removeWords, c("own", "stop", "words"))
- ## docs <- tm_map(docs, stripWhitespace)
- ##
- ## toString <- content_transformer(function(x, from, to) gsub(from, to, x))
- ## docs <- tm_map(docs, toString, "specific transform", "ST")
- ## docs <- tm_map(docs, toString, "other specific transform", "OST")
- ##
- ## docs <- tm_map(docs, stemDocument)
- ##
- ## ----review_analyse_corpus, eval=FALSE-----------------------------------
- ## # Document term matrix.
- ##
- ## dtm <- DocumentTermMatrix(docs)
- ## inspect(dtm[1:5, 1000:1005])
- ##
- ## # Explore the corpus.
- ##
- ## findFreqTerms(dtm, lowfreq=100)
- ## findAssocs(dtm, "data", corlimit=0.6)
- ##
- ## freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
- ## wf <- data.frame(word=names(freq), freq=freq)
- ##
- ## library(ggplot2)
- ##
- ## p <- ggplot(subset(wf, freq>500), aes(word, freq))
- ## p <- p + geom_bar(stat="identity")
- ## p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
- ##
- ## # Generate a word cloud
- ##
- ## library(wordcloud)
- ## wordcloud(names(freq), freq, min.freq=100, colors=brewer.pal(6, "Dark2"))
- ## ----common_outtro, child="finale.Rnw", eval=TRUE------------------------
- ## ----syinfo, child="sysinfo.Rnw", eval=TRUE------------------------------
- ## ----echo=FALSE, message=FALSE-------------------------------------------
- require(Hmisc)
- pkg <- "knitr"
- pkg.version <- installed.packages()[pkg, 'Version']
- pkg.date <- installed.packages(fields="Date")[pkg, 'Date']
- pkg.info <- paste(pkg, pkg.version, pkg.date)
- rev <- system("bzr revno", intern=TRUE)
- cpu <- system(paste("cat /proc/cpuinfo | grep 'model name' |",
- "head -n 1 | cut -d':' -f2"), intern=TRUE)
- ram <- system("cat /proc/meminfo | grep MemTotal: | awk '{print $2}'",
- intern=TRUE)
- ram <- paste0(round(as.integer(ram)/1e6, 1), "GB")
- user <- Sys.getenv("LOGNAME")
- node <- Sys.info()[["nodename"]]
- user.node <- paste0(user, "@", node)
- gcc.version <- system("g++ -v 2>&1 | grep 'gcc version' | cut -d' ' -f1-3",
- intern=TRUE)
- os <- system("lsb_release -d | cut -d: -f2 | sed 's/^[ \t]*//'", intern=TRUE)
复制代码
|
|