# Build a document-term matrix from the sentences in `txt$Sentence`.
#
# Steps: segment every sentence with segmentCN(), wrap the per-sentence token
# vectors in a tm corpus, count terms per document, and expose the counts as a
# dense matrix in `df_dtm1` (one row per sentence, one column per kept term).

# Segment each sentence on its own so that document i contains only the tokens
# of sentence i. seq_along() keeps the loop empty when there are no sentences.
train.rwordseg <- lapply(
  seq_along(txt$Sentence),
  function(i) segmentCN(txt$Sentence[i], nature = TRUE)
)

# Build the corpus: one document per segmented sentence.
wordcorpus <- Corpus(VectorSource(train.rwordseg))

# Switch the process locale before the term matrix is built.
# NOTE(review): "Chinese" looks like a Windows locale name; on other platforms
# this call presumably fails with a warning -- confirm the target platform.
Sys.setlocale(locale = "Chinese")

# Count raw term frequencies per document.
# NOTE(review): wordLengths = c(4, Inf) drops every term shorter than 4
# characters, which may exclude most short Chinese words -- confirm intent.
dtm1 <- DocumentTermMatrix(
  wordcorpus,
  control = list(
    wordLengths = c(4, Inf),                 # minimum length 4, no upper limit
    bounds = list(global = c(5, Inf)),       # term must occur in >= 5 documents
    removeNumbers = TRUE,
    # removePunctuation = list(preserve_intra_word_dashes = FALSE),
    weighting = weightTf,
    encoding = "UTF-8"
  )
)

# Convert the sparse term matrix to a dense matrix. as.matrix() is used
# instead of inspect(), because inspect() is a display helper and must not be
# relied on to return the complete matrix.
df_dtm1 <- as.matrix(dtm1)