人大经济论坛 › 论坛 › 金融投资论坛六区 › 金融学（理论版） › 量化投资 › 基于R语言的文本挖掘tm和Rwordseg包的说明资料分享

CDA数据分析研究院

商业数据分析与大数据领航教育品牌



经管云课堂

经管/金融/财会/社科/名师公开课



学术培训

Stata 空间计量 SSCI Python

贵宾：通行论坛特权+数据库权限
+案例库+下载特权 VIP：论坛特权+更多下载次数
+ccerdata数据库+更高阅读权限+……

返回列表

1 2 3 4 5 6 7 8 9 下一页

发帖

楼主: fantuanxiaot

11700 80

[源码分享] 基于R语言的文本挖掘tm和Rwordseg包的说明资料分享 [推广有奖]

14关注
289
粉丝

Ψ▄┳一大卫卍卐席尔瓦

大师

还不是VIP/贵宾

威望: 7 级
论坛币: -234475 个
通用积分: 124.0224
学术水平: 3783 点
热心指数: 3819 点
信用等级: 3454 点
经验: 150207 点
帖子: 7546
精华: 32
在线时间: 1327 小时
注册时间: 2013-2-3
最后登录: 2022-2-24

fantuanxiaot 发表于 2015-1-23 11:08:52 |显示全部楼层 |坛友微信交流群

相似文件

换一批

是否 +2 论坛币

k人参与回答

经管之家送您一份

应届毕业生专属福利!

求职就业群

赵安豆老师微信：zhaoandou666

经管之家联合CDA

送您一个全额奖学金名额~ !

立即领取

感谢您参与论坛问题回答

经管之家送您两个论坛币！

+2 论坛币

资料如下

本帖隐藏的内容

extensions.pdf (153.94 KB)

tm.pdf (168.46 KB)

Rwordseg_Vignette_CN.pdf (315.88 KB)

代码示例如下：

本帖隐藏的内容

### R code from vignette source 'tm.Rnw'
### Encoding: UTF-8

# Chunk 1 (Init) ----
# Attach tm and load the `crude` data set.
library(tm)
data(crude)

# Chunk 2 (Ovid) ----
# Locate the "texts/txt" directory installed with tm and build a corpus from
# it, reading the files as UTF-8 and passing language = "lat" to the reader.
# The outer parentheses make the assignment print its value.
txt <- system.file("texts", "txt", package = "tm")
(ovid <- Corpus(
  DirSource(txt, encoding = "UTF-8"),
  readerControl = list(language = "lat")
))

# Chunk 3 (VectorSource) ----
# Build a corpus straight from a character vector; the result is only printed.
docs <- c("This is a text.", "This another one.")
Corpus(VectorSource(docs))

# Chunk 4 (Reuters) ----
# Build a corpus from the "texts/crude" directory installed with tm, parsing
# each file with the readReut21578XML reader.
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(
  DirSource(reut21578),
  readerControl = list(reader = readReut21578XML)
)
###################################################
### code chunk number 5: tm.Rnw:120-121 (eval = FALSE)
###################################################
# Left commented out (the chunk is marked eval = FALSE), so writeCorpus is
# never called when this script runs.
## writeCorpus(ovid)
###################################################
### code chunk number 6: tm.Rnw:132-133
###################################################
# Display the first two documents of the `ovid` corpus.
inspect(ovid[1:2])
###################################################
### code chunk number 7: tm.Rnw:137-138
###################################################
# Check that access by position ([[2]]) and access by the name "ovid_2.txt"
# yield identical documents.
identical(ovid[[2]], ovid[["ovid_2.txt"]])
###################################################
### code chunk number 8: tm.Rnw:156-157
###################################################
# Apply as.PlainTextDocument to every document of the Reuters corpus.
reuters <- tm_map(reuters, as.PlainTextDocument)
###################################################
### code chunk number 9: tm.Rnw:165-166
###################################################
# Apply stripWhitespace to every document.
reuters <- tm_map(reuters, stripWhitespace)
###################################################
### code chunk number 10: tm.Rnw:171-172
###################################################
# Apply base R's tolower to every document.
# NOTE(review): newer tm releases reportedly require
# tm_map(reuters, content_transformer(tolower)) here -- confirm against the
# installed tm version before running.
reuters <- tm_map(reuters, tolower)
###################################################
### code chunk number 11: Stopwords
###################################################
# Remove the words returned by stopwords("english") from every document.
reuters <- tm_map(reuters, removeWords, stopwords("english"))
###################################################
### code chunk number 12: Stemming
###################################################
# Apply stemDocument to every document. The result is not assigned, so
# `reuters` itself is left unstemmed for the chunks that follow.
tm_map(reuters, stemDocument)
###################################################
### code chunk number 13: tm.Rnw:204-206
###################################################
# Filter the corpus with sFilter using a query on the `id` and `heading`
# fields. The filtered result is not assigned either.
query <- "id == '237' & heading == 'INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE'"
tm_filter(reuters, FUN = sFilter, query)
###################################################
### code chunk number 14: DublinCore
###################################################
# Set the Dublin Core "Creator" entry of the first `crude` document, then
# show that document's metadata.
DublinCore(crude[[1]], "Creator") <- "Ano Nymous"
meta(crude[[1]])
###################################################
### code chunk number 15: tm.Rnw:237-241
###################################################
# Store a corpus-level tag named "test" and show the corpus-level metadata.
meta(crude, tag = "test", type = "corpus") <- "test meta"
meta(crude, type = "corpus")
# Store a "foo" tag filled from letters[1:20] and show the resulting metadata.
# Presumably one letter per document (assumes `crude` holds 20 documents --
# TODO confirm).
meta(crude, "foo") <- letters[1:20]
meta(crude)
# Chunk 16 (tm.Rnw:258-260) ----
# Build a document-term matrix from the processed Reuters corpus and show
# rows 1-5, columns 100-105 of it.
dtm <- DocumentTermMatrix(reuters)
inspect(dtm[1:5, 100:105])

# Chunk 17 (tm.Rnw:269-270) ----
# List terms using a lower frequency bound of 5.
findFreqTerms(dtm, 5)

# Chunk 18 (tm.Rnw:275-276) ----
# List terms associated with "opec" using a correlation limit of 0.8.
findAssocs(dtm, "opec", 0.8)

# Chunk 19 (tm.Rnw:288-289) ----
# Drop sparse terms using a sparsity threshold of 0.4 and show the result.
inspect(removeSparseTerms(dtm, 0.4))

# Chunk 20 (tm.Rnw:303-305) ----
# Rebuild the matrix restricted to a three-word dictionary and show it.
inspect(DocumentTermMatrix(
  reuters,
  list(dictionary = c("prices", "crude", "oil"))
))

复制代码

### R code from vignette source 'extensions.Rnw'
###################################################
### code chunk number 1: Init
###################################################
library("tm")
library("XML")
###################################################
### code chunk number 2: extensions.Rnw:71-76
###################################################
# Construct a source object of class "VectorSource" from a vector.
#
# x: a vector; its length and names are handed to Source(), and its elements,
#    coerced to character, are stored in the `Content` field.
# Returns the object created by Source() with `Content` filled in.
VecSource <- function(x) {
  src <- Source(
    length = length(x),
    names = names(x),
    class = "VectorSource"
  )
  src$Content <- as.character(x)
  src
}
###################################################
### code chunk number 3: extensions.Rnw:85-89
###################################################
# getElem method for "VectorSource" objects.
#
# x: a source whose `Content` field holds the elements and whose `Position`
#    field indexes into it.
# Returns a list with `content` (the indexed element of `Content`) and
# `uri`, which is always NA.
getElem.VectorSource <- function(x) {
  current <- x$Content[x$Position]
  list(content = current, uri = NA)
}
# pGetElem method for "VectorSource" objects: returns all elements at once.
#
# x: a source whose `Content` field holds the elements.
# Returns a list with one entry per element of `Content`; each entry is a
# list with `content` (that element) and `uri` (always NA).
pGetElem.VectorSource <- function(x) {
  wrap_element <- function(y) list(content = y, uri = NA)
  lapply(x$Content, wrap_element)
}
###################################################
### code chunk number 4: extensions.Rnw:114-117
###################################################
# Reader that turns one source element into a plain text document.
#
# elem:     a list whose `content` entry holds the document text.
# language: passed through as the document's language.
# id:       passed through as the document's id.
# Returns whatever PlainTextDocument() builds from those three values.
readPlain <- function(elem, language, id) {
  PlainTextDocument(elem$content, id = id, language = language)
}
###################################################
### code chunk number 5: extensions.Rnw:145-150
###################################################
# Example data: three rows, one per document, with the text in `contents`
# and the title, authors and topics in their own columns. All columns are
# kept as character vectors rather than factors.
df <- data.frame(
  contents = c("content 1", "content 2", "content 3"),
  title = c("title 1", "title 2", "title 3"),
  authors = c("author 1", "author 2", "author 3"),
  topics = c("topic 1", "topic 2", "topic 3"),
  stringsAsFactors = FALSE
)
###################################################
### code chunk number 6: extensions.Rnw:156-157
###################################################
# Chunk 6 (extensions.Rnw:156-157) ----
# Show the attribute names carried by an empty PlainTextDocument.
names(attributes(PlainTextDocument()))

# Chunk 7 (Mapping) ----
# Map document fields (list names) to data frame column names (list values).
m <- list(
  Content = "contents",
  Heading = "title",
  Author = "authors",
  Topic = "topics"
)

# Chunk 8 (myReader) ----
# Create a reader from that mapping via readTabular.
myReader <- readTabular(mapping = m)

# Chunk 9 (extensions.Rnw:180-181) ----
# Build a corpus from the data frame `df` with the custom reader; the outer
# parentheses make the assignment print its value.
(corpus <- Corpus(
  DataframeSource(df),
  readerControl = list(reader = myReader)
))

# Chunk 10 (extensions.Rnw:186-188) ----
# Show the first document and its metadata.
corpus[[1]]
meta(corpus[[1]])
###################################################
### code chunk number 11: CustomXMLFile
###################################################
# Locate the sample file "texts/custom.xml" installed with tm and print its
# lines without surrounding quotes.
custom.xml <- system.file("texts", "custom.xml", package = "tm")
print(readLines(custom.xml), quote = FALSE)
###################################################
### code chunk number 12: mySource
###################################################
# Source constructor for the custom XML format.
#
# x:        passed to XMLSource() as its first argument.
# encoding: passed to XMLSource() as its fourth argument; defaults to "UTF-8".
# Returns the result of XMLSource(), called with a parser that takes the
# children of the XML root node and with myXMLReader as the reader.
# myXMLReader is looked up when mySource() is called, so it only has to exist
# by then.
mySource <- function(x, encoding = "UTF-8") {
  root_children <- function(tree) XML::xmlChildren(XML::xmlRoot(tree))
  XMLSource(x, root_children, myXMLReader, encoding)
}
###################################################
### code chunk number 13: myXMLReader
###################################################
# Reader for the custom XML format, built with readXML().
# Each `spec` entry is a two-element list: a kind ("node", "attribute",
# "function" or "unevaluated") followed by an XPath expression, a function,
# or a literal string. Documents are created from an empty PlainTextDocument.
myXMLReader <- readXML(
  spec = list(
    Author = list("node", "/document/writer"),
    Content = list("node", "/document/description"),
    # Time stamp taken from the current time, in GMT.
    DateTimeStamp = list(
      "function",
      function(x) as.POSIXlt(Sys.time(), tz = "GMT")
    ),
    Description = list("attribute", "/document/@short"),
    Heading = list("node", "/document/caption"),
    # ID taken from a freshly generated temporary file name.
    ID = list("function", function(x) tempfile()),
    Origin = list("unevaluated", "My private bibliography"),
    Type = list("node", "/document/type")
  ),
  doc = PlainTextDocument()
)
###################################################
### code chunk number 14: extensions.Rnw:301-302
###################################################
# Build a corpus from the custom XML file through mySource(); this replaces
# any earlier value of `corpus`.
corpus <- Corpus(mySource(custom.xml))
###################################################
### code chunk number 15: extensions.Rnw:306-308
###################################################
# Show the first document and its metadata.
corpus[[1]]
meta(corpus[[1]])