做情感分析清洗数据时,有好几处报错"量度数目不对"(incorrect number of dimensions)。我是新手,之前没有学过 R,请教各位:到底是什么原因?
# Load the raw training messages and build the sentiment dictionary.
#
# `train`  : data frame with a single character column `msg`.
# `posneg` : data frame with columns `term` and `weight`
#            (+1 for positive words, -1 for negative words), de-duplicated
#            on `term`.
# `dict`   : character vector of all dictionary terms.

# Quoting is disabled and the double-quote character is used as the field
# separator; the single resulting column is named `msg`.
# NOTE(review): a line that itself contains a `"` would yield more than one
# field and no longer match the single column name -- confirm the input file.
train <- read.csv(
  "C:\\Users\\Administrator\\Desktop\\新建文件夹\\1.csv",
  quote = "", sep = "\"", header = FALSE, col.names = "msg",
  stringsAsFactors = FALSE
)

# Negative words get weight -1.
neg <- read.csv(
  "C:\\Users\\Administrator\\Desktop\\新建文件夹\\neg.csv",
  header = FALSE, sep = ",", stringsAsFactors = FALSE
)
neg$weight <- rep(-1, nrow(neg))

# Positive words get weight +1.
pos <- read.csv(
  "C:\\Users\\Administrator\\Desktop\\新建文件夹\\pos.csv",
  header = FALSE, sep = ",", stringsAsFactors = FALSE
)
pos$weight <- rep(1, nrow(pos))

# Stack both lists; positive rows come first, so when a word appears in both
# files `duplicated()` keeps the positive entry.
posneg <- rbind(pos, neg)
names(posneg) <- c("term", "weight")
posneg <- posneg[!duplicated(posneg$term), ]
dict <- posneg[, "term"]
# Clean the raw messages, segment them into words, and build `testterm`,
# a long table with one row per segmented word (columns: id, term, label).
library(Rwordseg)

# The steps below need `id` and `label` columns, but `train` is read with a
# single `msg` column. Create a row-number id, and an NA placeholder label,
# when they are missing so the later `rep()` calls have something to expand.
# NOTE(review): replace the NA placeholder with real labels if the input
# data provides them.
if (!"id" %in% names(train)) {
  train$id <- seq_len(nrow(train))
}
if (!"label" %in% names(train)) {
  train$label <- rep(NA_character_, nrow(train))
}

# Strip digits, ASCII letters and literal dots from every message.
sentence <- as.vector(train$msg)
sentence <- gsub("[[:digit:]]*", "", sentence)
sentence <- gsub("[a-zA-Z]", "", sentence)
sentence <- gsub("\\.", "", sentence)

# Keep only messages that are non-missing and at least 2 characters long
# after cleaning, and filter `train` and `sentence` with the SAME mask so
# they stay aligned row-for-row.
#
# Cause of the "incorrect number of dimensions" errors: indexing a
# one-column data frame as `train[mask, ]` silently drops it to a plain
# vector, so the next `train[mask, ]` / `train[, "id"]` has no second
# dimension to index. `drop = FALSE` keeps `train` a data frame.
keep <- !is.na(sentence) & nchar(sentence) >= 2
train <- train[keep, , drop = FALSE]
sentence <- sentence[keep]

# Segment each message into words (timing is printed as in the original).
system.time(x <- segmentCN(strwords = sentence))

# NOTE(review): segmentCN is understood to return a bare character vector
# (not a list) when given a single string -- confirm against Rwordseg docs.
if (!is.list(x)) {
  x <- list(x)
}
stopifnot(length(x) == nrow(train))

# Number of words per message; used to repeat each message's id/label once
# per word so the three vectors line up.
temp <- lengths(x)
id <- rep(train$id, temp)
label <- rep(train$label, temp)
term <- unlist(x)

# cbind() on vectors yields a character matrix, so all three columns of
# `testterm` are character.
testterm <- as.data.frame(cbind(id, term, label), stringsAsFactors = FALSE)
# Remove stop words from the segmented terms, but never remove a word that
# is part of the sentiment dictionary (`posneg$term`).
stopword <- read.csv(
  "C:\\Users\\Administrator\\Desktop\\新建文件夹\\stopword.csv",
  header = FALSE, sep = ",", stringsAsFactors = FALSE
)

# With header = FALSE the column gets a default name, so `stopword$term`
# would be NULL and the filter below would silently keep nothing. Name the
# first column explicitly.
names(stopword)[1] <- "term"

# Keep a plain character vector of stop words that are not sentiment terms;
# `%in%` needs a vector on its right-hand side, not a data frame.
stopword <- stopword$term[!stopword$term %in% posneg$term]

testterm <- testterm[!testterm$term %in% stopword, , drop = FALSE]