操作代码:
# Clear the environment: removes every object listed by ls() from the current R environment (a personal habit of the author, meant to avoid mixing up objects)
- rm(list = ls())
# Set the working directory (replace this path with your own working folder; placing the data files to import directly in that folder means no path has to be added to the file names later)
- setwd("E:/Downloads/OneDrive/ProPhet/Codes/RCode/DataClean")
## Load haven (imports Stata .dta files) and tidyverse (pipe, mutate/select, joins, drop_na)
## Package names are case-sensitive: the package is "tidyverse", not "Tidyverse".
## Each package is installed only when missing and is then always attached with library(),
## so a fresh install is usable in the same session and a failed load stops the script here.
- if (!requireNamespace("haven", quietly = TRUE)) install.packages("haven")
- library(haven)
- if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse")
- library(tidyverse)
## Import the four Stata (.dta) source files; the bare file names are resolved against the working directory
- dm <- read_dta(file = "demographic_background.dta") #### demographic characteristics
- hlt <- read_dta(file = "health_status_and_functioning.dta") #### health status
- bm <- read_dta(file = "biomarkers.dta") #### physical examination data
- bld <- read_dta(file = "Blood_20140429.dta") #### blood test indicators
## Select variables
- ### Demographic characteristics
- colnames(dm) #### inspect the variable names of the dataset
- dmslc <- dm %>%
-   #### derive new variables
-   mutate(
-     #### age = 2011 minus ba002_1 (presumably the birth year; confirm against the codebook)
-     age = 2011 - ba002_1,
-     #### education: bd001 below 4, equal to 4, equal to 5, anything else
-     education = ifelse(
-       bd001 < 4, "Illiterate",
-       ifelse(
-         bd001 == 4, "Elementary school",
-         ifelse(bd001 == 5, "Middle school", "High school and above")
-       )
-     ),
-     #### marital status: be001 below 3, equal to 6, anything else
-     mariage = ifelse(
-       be001 < 3, "married",
-       ifelse(be001 == 6, "Never married", "Others")
-     )
-   ) %>%
-   #### keep the analysis variables; rgender is carried over under the name gender
-   select(ID, age, gender = rgender, education, mariage)
- ### Health status
- colnames(hlt) #### inspect the variable names of the dataset
- hltslc <- hlt %>%
-   #### derive new variables
-   mutate(
-     #### hyper is 1 when da007_1_ equals 1 and 0 otherwise
-     hyper = ifelse(da007_1_ == 1, 1, 0),
-     #### sleep is coded 1 (da049 below 5), 2 (5 to 7) or 3 (above 7)
-     sleep = ifelse(da049 < 5, 1, ifelse(da049 <= 7, 2, 3))
-   ) %>%
-   #### keep the analysis variables; da007_3_ is carried over under the name dm
-   select(ID, hyper, dm = da007_3_, sleep)
- ### Physical examination data
- colnames(bm) #### inspect the variable names of the dataset
- bmslc <- bm %>%
-   #### derive new variables: bmi = ql002 / (qi002 / 100)^2
-   #### (presumably weight in kg over squared height in m, with qi002 in cm; confirm against the codebook)
-   mutate(bmi = ql002 / (qi002 / 100)^2) %>%
-   #### keep the analysis variables; qm002 is carried over under the name waist
-   select(ID, bmi, waist = qm002)
- ### Blood test data
- colnames(bld) #### inspect the variable names of the dataset
- bldslc <- bld %>%
-   #### keep ID plus the nine blood indicators, renaming each new* column to its upper-case short name
-   select(
-     ID,
-     BUN = newbun,
-     GLU = newglu,
-     CREA = newcrea,
-     CHO = newcho,
-     TG = newtg,
-     HDLC = newhdl,
-     LDLC = newldl,
-     CRP = newcrp,
-     UA = newua
-   )
- merge.data <- dmslc %>%
-   #### merge the four subsets on ID
-   inner_join(hltslc, by = "ID") %>% #### inner join: keeps only the IDs present in both datasets
-   left_join(bmslc, by = "ID") %>% #### left join: keeps every row of the left-hand (main) dataset
-   full_join(bldslc, by = "ID") %>% #### full join: keeps every row of both datasets
-   #### recode variables as factors; values outside the listed levels become NA
-   mutate(
-     gender = factor(gender, levels = c(1, 2), labels = c("Male", "Female")),
-     education = factor(
-       education,
-       levels = c(
-         "Illiterate", "Elementary school",
-         "Middle school", "High school and above"
-       )
-     ),
-     mariage = factor(mariage, levels = c("married", "Never married", "Others")),
-     dm = factor(dm, levels = c(1, 2), labels = c("Yes", "No")),
-     sleep = factor(sleep, levels = c(1, 2, 3), labels = c("<5h", "5-7h", ">7h"))
-   ) %>%
-   #### drop_na() without arguments removes every row that has a missing value in any column
-   drop_na() %>%
-   ### convert to a plain data.frame
-   as.data.frame()
## Export the merged data as a CSV (.csv) file in the working directory, written with GB18030 encoding
- write.csv(merge.data, file = "DataClean.csv", fileEncoding = "GB18030")



雷达卡




京公网安备 11010802022788号







