# Create the data frame used by every benchmark below.
# All four columns are random draws of the same length, so the row count is
# defined once instead of being repeated in every call.
n_rows <- 12^5  # 248,832 rows

col1 <- runif(n_rows, min = 0, max = 2)  # uniform on [0, 2]
col2 <- rnorm(n_rows, mean = 0, sd = 2)  # normal, mean 0, sd 2
col3 <- rpois(n_rows, lambda = 3)        # Poisson, rate 3
col4 <- rchisq(n_rows, df = 2)           # chi-squared, 2 degrees of freedom
df <- data.frame(col1, col2, col3, col4)
逐行判断该数据框 (df) 每一行各列之和是否大于 4:如果条件满足,则新变量对应的取值为 'greater_than_4',否则赋值为 'lesser_than_4'。
几种方法的比较:
方法一:
# Method 1 (baseline): original code, before vectorization and
# pre-allocation. Kept deliberately naive so its timing can be compared with
# the faster methods: every iteration indexes the data frame four times and
# then assigns into it.
system.time({
  for (i in seq_len(nrow(df))) {  # seq_len() is safe when df has zero rows
    row_total <- df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]
    if (row_total > 4) {
      df[i, 5] <- "greater_than_4"  # label goes into the 5th column
    } else {
      df[i, 5] <- "lesser_than_4"
    }
  }
})
——结论:这种逐行索引并赋值的写法速度非常慢。
方法二:向量化思想
先定义好结果变量并赋上初始值(预分配);再把判断条件放在循环外一次性计算好;最后只对满足条件的行进入循环。
# Method 2: pre-allocate the result and evaluate the condition once, outside
# the loop, so the loop only touches the rows that need a different label.
output <- rep("lesser_than_4", nrow(df))  # every row starts with the default
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4
system.time({
  # which() yields only the indices where the condition is TRUE, so no
  # per-iteration test is needed inside the loop.
  for (i in which(condition)) {
    output[i] <- "greater_than_4"
  }
  df$output <- output  # store the labels in the data frame
})
方法三:使用ifelse
# Method 3: fully vectorized with ifelse() -- no explicit loop.
system.time({
  output <- ifelse(
    (df$col1 + df$col2 + df$col3 + df$col4) > 4,
    "greater_than_4",
    "lesser_than_4"
  )
  df$output <- output
})
方法四:使用which语句
# Method 4: which() plus a pre-filled vector. Thanks to Gabe Becker.
system.time({
  # Sum only the four numeric columns: by this point df may also carry the
  # character label columns written by the earlier methods, and rowSums()
  # errors on non-numeric input.
  want <- which(rowSums(df[, c("col1", "col2", "col3", "col4")]) > 4)
  # Same labels as the other methods so the results are directly comparable.
  output <- rep("lesser_than_4", times = nrow(df))
  output[want] <- "greater_than_4"
})
方法五:使用apply族函数替代for循环
# Method 5: apply family instead of an explicit for loop.
system.time({
  # Label a single row; `x` is indexed by column name, so it must carry the
  # names col1..col4.
  myfunc <- function(x) {
    row_total <- x[["col1"]] + x[["col2"]] + x[["col3"]] + x[["col4"]]
    if (row_total > 4) "greater_than_4" else "lesser_than_4"
  }
  # MARGIN = 1 applies myfunc to every row of the first four columns.
  output <- apply(df[, 1:4], 1, FUN = myfunc)
  df$output <- output
})
总结:大家可以分别用上述方法尝试一下,比较各自的运行时间。一般而言应尽量避免显式的 for 循环,能用向量化或隐式循环就尽量用;另外,如果一定要用 for,尽量在 for 循环之前把结果的类型、长度和初始值都定义好,这样能大大减少系统开销。



雷达卡





京公网安备 11010802022788号







