最优分组功能尚未实现,后续将补充添加。
/*------------------------ ChcAnalysis: variable binning macro ------------------------
  Bins the independent variable IVar and summarises the dependent variable DVar per bin.

  Parameters
    DSin       input data set. NOTE: it is sorted in place by IVar.
    method     1 = equal frequency (bins built on the sorted observation number)
               2 = equal width     (bins built on the value range of IVar)
               3 = quantile bins   (cut points from PROC UNIVARIATE percentiles)
    Nbins      number of bins; required for method 1 and 2, ignored for method 3
    IVar       independent variable to bin
    DVar       dependent variable; SUM(DVar) is reported as N_1, so it is presumably
               a 0/1 flag -- confirm against callers
    DSout      output data set, one row per bin, with the columns
               Bin, Lowerbound, upperbound, N_1, Bintotal, N_0, percent_1, percent_0
    p_start, p_end, p_interval
               percentile grid for method 3 (PCTLPTS = p_start TO p_end BY p_interval);
               the number of bins is (number of percentile points - 1)

  Bin intervals are half-open, [Lowerbound, upperbound), except that the last bin of
  method 2 and 3 is closed on the right so the largest value is not lost.
  For method 1 the bounds are observation numbers (1-based), not IVar values.

  Work data sets created: temp (all methods), temp_qt and qt_dsout_trans (method 3).
---------------------------------------------------------------------------------------*/
%macro ChcAnalysis(DSin=,method=,Nbins=,IVar=,DVar=,DSout=,p_start=,p_end=,p_interval=);
    %local i k bin_count bin_var n_rows Vmin Vmax Binsize n_cut cut_values;

    /* Sort by the independent variable so that _obs below is the rank of the row. */
    /* The input data set itself is sorted (behaviour kept from the original).     */
    proc sort data=&DSin;
        by &IVar;
    run;

    /* Working copy holding only the analysis variables plus the observation number. */
    data temp;
        set &DSin;
        by &IVar;
        _obs = _n_;
        keep &IVar &DVar _obs;
    run;

    %if &method = 1 %then %do;
        /* ---- Equal frequency: split the observation numbers into Nbins ranges ---- */
        %if %length(&Nbins) = 0 %then %do;
            %put ERROR: ChcAnalysis: Nbins is required when method=1.;
            %return;
        %end;
        %let bin_var   = _obs;
        %let bin_count = &Nbins;

        /* format=best32. avoids the default BEST8. conversion used by INTO. */
        proc sql noprint;
            select count(*),
                   min(_obs) format=best32.,
                   max(_obs) format=best32.
                into :n_rows, :Vmin, :Vmax
                from temp;
        quit;
        %if &n_rows = 0 %then %do;
            %put ERROR: ChcAnalysis: &DSin has no observations to bin.;
            %return;
        %end;

        /* _obs runs from Vmin (=1) to Vmax, so the covered span is Vmax-Vmin+1 rows. */
        /* Starting at Vmin makes the last half-open bin end just above Vmax, which   */
        /* keeps the final observation inside a bin.                                  */
        %let Binsize = %sysevalf(((&Vmax) - (&Vmin) + 1) / &bin_count);
        %do i = 1 %to &bin_count;
            %local LB_&i UB_&i OP_&i;
            %let LB_&i = %sysevalf((&Vmin) + (&i - 1) * &Binsize);
            /* Same formula as the next bin's lower bound, so bins stay contiguous. */
            %let UB_&i = %sysevalf((&Vmin) + &i * &Binsize);
            %let OP_&i = lt;
        %end;
    %end;
    %else %if &method = 2 %then %do;
        /* ---- Equal width: split [min(IVar), max(IVar)] into Nbins intervals ---- */
        %if %length(&Nbins) = 0 %then %do;
            %put ERROR: ChcAnalysis: Nbins is required when method=2.;
            %return;
        %end;
        %let bin_var   = &IVar;
        %let bin_count = &Nbins;

        proc sql noprint;
            select count(&IVar),
                   min(&IVar) format=best32.,
                   max(&IVar) format=best32.
                into :n_rows, :Vmin, :Vmax
                from temp;
        quit;
        %if &n_rows = 0 %then %do;
            %put ERROR: ChcAnalysis: &IVar has no non-missing values in &DSin..;
            %return;
        %end;

        /* Parentheses protect negative minima/maxima inside the expression. */
        %let Binsize = %sysevalf(((&Vmax) - (&Vmin)) / &bin_count);
        %do i = 1 %to &bin_count;
            %local LB_&i UB_&i OP_&i;
            /* Bins start at the observed minimum, not at zero. */
            %let LB_&i = %sysevalf((&Vmin) + (&i - 1) * &Binsize);
            %if &i < &bin_count %then %do;
                %let UB_&i = %sysevalf((&Vmin) + &i * &Binsize);
                %let OP_&i = lt;
            %end;
            %else %do;
                /* Last bin ends exactly at the maximum and includes it. */
                %let UB_&i = &Vmax;
                %let OP_&i = le;
            %end;
        %end;
    %end;
    %else %if &method = 3 %then %do;
        /* ---- Quantile bins: cut points are the requested percentiles of IVar ---- */
        %let bin_var = &IVar;

        /* Store the percentiles in a one-row data set (variables P_<pct>). */
        proc univariate data=temp noprint;
            var &IVar;
            output out=temp_qt
                pctlpts = &p_start to &p_end by &p_interval
                pctlpre = P_;
        run;

        /* One row per percentile: Quantiles = variable name, value = cut point. */
        proc transpose data=temp_qt
            out=qt_dsout_trans(rename=(col1=value) drop=_LABEL_)
            name=Quantiles;
        run;

        /* Collect the cut points as a blank-separated list. */
        proc sql noprint;
            select value format=best32.
                into :cut_values separated by ' '
                from qt_dsout_trans;
        quit;
        /* SQLOBS is the number of rows selected, i.e. the number of cut points. */
        %let n_cut = &sqlobs;
        %if &n_cut < 2 %then %do;
            %put ERROR: ChcAnalysis: method=3 needs at least two percentile points (got &n_cut).;
            %return;
        %end;
        %put NOTE: ChcAnalysis: quantile cut points: &cut_values;

        /* Consecutive cut points delimit one bin each. */
        %let bin_count = %eval(&n_cut - 1);
        %do i = 1 %to &bin_count;
            %local LB_&i UB_&i OP_&i;
            %let k = %eval(&i + 1);
            /* %str( ) keeps the blank as the only delimiter (minus signs survive). */
            %let LB_&i = %scan(&cut_values, &i, %str( ));
            %let UB_&i = %scan(&cut_values, &k, %str( ));
            %if &i < &bin_count %then %let OP_&i = lt;
            %else %let OP_&i = le;
        %end;
    %end;
    %else %do;
        %put ERROR: ChcAnalysis: unsupported method=&method (expected 1, 2 or 3).;
        %return;
    %end;

    /* Per-bin totals: sum of DVar and number of non-missing DVar values. */
    %do i = 1 %to &bin_count;
        %local sum_&i N_&i;
        proc sql noprint;
            select sum(&DVar)   format=best32.,
                   count(&DVar) format=best32.
                into :sum_&i, :N_&i
                from temp
                where &bin_var >= &&LB_&i
                  and &bin_var &&OP_&i &&UB_&i;
        quit;
    %end;

    /* Write the bin map: one generated block of statements (and one row) per bin. */
    data &DSout;
        %do i = 1 %to &bin_count;
            Bin        = &i;
            Lowerbound = &&LB_&i;
            upperbound = &&UB_&i;
            /* SUM over an empty bin is missing; report it as zero. */
            N_1        = coalesce(&&sum_&i, 0);
            Bintotal   = coalesce(&&N_&i, 0);
            N_0        = Bintotal - N_1;
            /* Empty bins get missing percentages instead of a division by zero. */
            if Bintotal > 0 then do;
                percent_1 = N_1 / Bintotal;
                percent_0 = N_0 / Bintotal;
            end;
            else do;
                percent_1 = .;
                percent_0 = .;
            end;
            output;
        %end;
    run;

%mend ChcAnalysis;