* row_data是原始数据集,destination是分类(目标)变量;
* 目前只处理字符型变量;数值型变量的信息熵算法尚未实现,有空再继续补充;
/*
 * %entropy(row_data, destination)
 *
 * For every character variable V in &row_data (other than &destination),
 * computes the conditional entropy, in bits, of &destination given V:
 *
 *     H(destination | V) = - sum_v P(V=v) * sum_d P(d | v) * log2 P(d | v)
 *
 * The probabilities are taken from the PROC FREQ cross tabulation
 * V * destination (rows with missing values are excluded by PROC FREQ).
 *
 * Parameters:
 *   row_data     name of the input data set (no data set options).
 *   destination  name of the target (class) variable; character or numeric.
 *
 * Output (WORK library):
 *   entropy_variable   one row per analysed variable: variable ($40), entropy.
 *   entropy_<name>     the single-row result for each variable; the data set
 *                      name is truncated to 32 characters when necessary.
 *   var_list, var_cha_list, pro   scratch tables, overwritten on every call.
 *
 * Numeric variables are not analysed.
 *
 * Written by Albert.feng; please credit the author if you use this code.
 */
%macro entropy(row_data,destination);
    /* Keep every working macro variable local so the caller's symbols are not clobbered. */
    %local dsid rc nobs i varnume variable out_ds;

    %if %length(&row_data) = 0 or %length(&destination) = 0 %then %do;
        %put ERROR: entropy: both row_data and destination must be supplied.;
        %return;
    %end;
    %if not (%sysfunc(exist(&row_data)) or %sysfunc(exist(&row_data,VIEW))) %then %do;
        %put ERROR: entropy: input data set &row_data does not exist.;
        %return;
    %end;

    /* List the columns of the input, leaving out the target variable. */
    proc contents data=&row_data(drop=&destination) noprint out=var_list;
    run;

    /* PROC CONTENTS reports TYPE=2 for character columns; only those are analysed. */
    data var_cha_list;
        set var_list;
        where type=2;
        keep name;
    run;

    /* Start from an empty result table (replaces any table left by an earlier call). */
    data entropy_variable;
        length variable $40 entropy 8;
        stop;
    run;

    %let dsid = %sysfunc(open(var_cha_list));
    %if &dsid le 0 %then %do;
        %put ERROR: entropy: cannot open WORK.VAR_CHA_LIST. %sysfunc(sysmsg());
        %return;
    %end;

    %let nobs    = %sysfunc(attrn(&dsid,nlobs));
    /* The position of NAME does not change between rows, so look it up once. */
    %let varnume = %sysfunc(varnum(&dsid,name));

    %do i=1 %to &nobs;
        /* FETCHOBS reads row &i; FETCH has no row-number argument. */
        %let rc = %sysfunc(fetchobs(&dsid,&i));
        %if &rc eq 0 %then %do;
            %let variable = %sysfunc(getvarc(&dsid,&varnume));
            /* Data set names are limited to 32 characters; SUBSTRN truncates safely. */
            %let out_ds = %sysfunc(substrn(entropy_&variable,1,32));

            /* Remove the previous cross tabulation so a failed PROC FREQ cannot
               silently reuse the table of the preceding variable. */
            proc datasets lib=work nolist nowarn;
                delete pro;
            quit;

            /* Capture the cross tabulation as a data set without rendering it as HTML. */
            ods html close;
            ods output CrossTabFreqs=pro;
            proc freq data=&row_data;
                table &variable * &destination;
            run;
            ods output close;
            ods html;

            %if %sysfunc(exist(pro)) %then %do;
                /*
                 * In CrossTabFreqs, _TYPE_='11' marks the individual cells
                 * (RowPercent = 100 * P(destination | variable)) and _TYPE_='10'
                 * marks the row totals (Percent = 100 * P(variable)).
                 * Selecting on _TYPE_ works for character and numeric targets alike.
                 * Cells with a row percent of 0 or 100 contribute nothing.
                 */
                proc sql noprint;
                    create table &out_ds as
                    select "&variable" as variable length=40,
                           -sum(case
                                    when cell.rowpercent > 0 and cell.rowpercent < 100
                                    then (cell.rowpercent/100)
                                         * log2(cell.rowpercent/100)
                                         * (tot.percent/100)
                                    else 0
                                end) as entropy
                    from pro(where=(_type_='11')) as cell
                         inner join
                         pro(where=(_type_='10')) as tot
                         on cell.&variable = tot.&variable;
                quit;

                /* Collect the per-variable result in the combined table. */
                proc append base=entropy_variable data=&out_ds force;
                run;
            %end;
            %else %do;
                %put NOTE: entropy: no cross tabulation produced for &variable - skipped.;
            %end;
        %end;
    %end;

    %let rc = %sysfunc(close(&dsid));
%mend entropy;
/* Example call: analyse every character variable of data set C against the target PLAY_GOLF. */
%entropy(c,play_golf);