这个宏主要是用proc means求一系列统计量的
后来又加了些我自己做的统计量计算,如半方量、几何平均值和连乘积等
用这类宏的好处就是方便调用,因为SAS各个过程的可自定义的东西比较多,经常忘
我现在能比较熟练使用的过程就proc sql了
- %macro GetStatsForTable(SourceTable,TargetTable,ByFactors,InputVar,InputVarType,OutputVarType,Weight,Statistic);
- /**********************************************************************/
- /* 此宏计算所选变量的统计指标。其中SourceTable是含有所选变量的原始表 */
- /* 格;TargetTable是结果表格;ByFactors是对统计量进行分组研究的分组变 */
- /* 量,没有分组变量时可设置为空;InputVar是进行统计的目标变量,若需要 */
- /* 统计全部变量、全部数值变量和全部字符变量,可分别设为_ALL_、 */
- /* _NUMERIC_和_CHARACTER_;InputVarType是目标变量的类型,当Statistic= */
- /* GMEAN时必须设置,=P表示价格变量,=R表示收益率变量;OutputVarType是 */
- /* 输出变量的类型,=Origin表示不作处理,即输出变量名称等于目标变量的 */
- /* 名称,并用_STAT_变量区分不同的统计量,=Suffix表示输出变量用不同的 */
- /* 统计量作为后缀;Weight是权重变量;Statistic是指定的统计量,=All时 */
- /* 同时计算N|MIN|MAX|MEAN|STD,也可以单独设置,可以设置的统计量如下所 */
- /* 示: */
- /* */
- /* - Descriptive statistics keyword */
- /* CSS RANGE CV SKEW KURT STD LCLM STDERR MAX SUM MEAN GMEAN(Geo- */
- /* metric Mean) GSUM(Continued Product) SUMWGT MIN */
- /* UCLM MODE USS N VAR SVAR(Semi-Variance) NMISS */
- /* - Quantile statistics keyword */
- /* MEDIAN P1 P5 P10 P25 P50 P75 P90 P95 P99 Q1 Q3 QRANGE */
- /* - Hypothesis testing keyword */
- /* PRT T */
- /* */
- /* 最终得到的是含有统计结果的结果表格&TargetTable。 */
- /* */
- /* Created on 2012.12.25 */
- /* Modified on 2013.4.28 */
- /**********************************************************************/
- /* 检查TargetTable的存在性,若不存在则设为&SourceTable */
- %if &TargetTable EQ %STR() %then %let TargetTable=&SourceTable;
- /* 检查ByFactors的存在性 */
- %if &ByFactors NE %STR() %then %do;
- %ChkVar(SourceTable=&SourceTable,InputVar=&ByFactors,FlagVarExists=GSFT_FlagVarExists1);
- %if %SYSFUNC(FIND(&GSFT_FlagVarExists1,0)) NE 0 %then %do;
- %put ERROR: The ByFactors "%SCAN(&ByFactors,%SYSFUNC(FIND(&GSFT_FlagVarExists1,0)))" does not exist in SourceTable, please check it again.;
- %goto exit;
- %end;
- /* 逗号分隔的ByFactors_Comma用于SQL语句 */
- %local ByFactors_Comma;
- %if %SYSFUNC(FIND(&ByFactors,%STR( ))) NE 0 %then %do;
- %let ByFactors_Comma=%SYSFUNC(TRANWRD(%SYSFUNC(COMPBL(&ByFactors)),%STR( ),%STR(,)));
- %end;
- %else %do;
- %let ByFactors_Comma=&ByFactors;
- %end;
- %end;
- /* 检查InputVar的存在性 */
- %if %SYSFUNC(FIND(&InputVar,%STR(:))) NE 0 %then %do;
- %GetVarListForTable(SourceTable=&SourceTable,
- TargetTable=,
- OutputVar=GSFT_InputVar,
- VarType=&InputVar);
- %let InputVar=&GSFT_InputVar;
- %end;
- %if %UPCASE(&InputVar) NE _ALL_ AND %UPCASE(&InputVar) NE _NUMERIC_ AND %UPCASE(&InputVar) NE _CHARACTER_ %then %do;
- %ChkVar(SourceTable=&SourceTable,InputVar=&InputVar,FlagVarExists=GSFT_FlagVarExists2);
- %if %SYSFUNC(FIND(&GSFT_FlagVarExists2,0)) NE 0 %then %do;
- %put ERROR: The InputVar %SCAN(&InputVar,%SYSFUNC(FIND(&GSFT_FlagVarExists2,0))) does not exist in SourceTable, please check it again.;
- %goto exit;
- %end;
- %end;
- /* 检查InputVarType的合法性 */
- %if %UPCASE(&Statistic) EQ GMEAN %then %do;
- %if (%UPCASE(&InputVarType) NE P) AND (%UPCASE(&InputVarType) NE R) %then %do;
- %put ERROR: The InputVarType should be P or R, case insensitive and without quotes.;
- %goto exit;
- %end;
- %end;
- /* 拆分InputVar */
- %SeparateString(InputString=&InputVar,OutputString=GSFT_InputVar);
- /* 检查OutputVarType的合法性 */
- %if &OutputVarType EQ %STR() %then %let OutputVarType=Origin;
- %if (%UPCASE(&OutputVarType) NE ORIGIN) AND (%UPCASE(&OutputVarType) NE SUFFIX) %then %do;
- %put ERROR: The OutputVarType should be Origin or Suffix, case insensitive and without quotes.;
- %goto exit;
- %end;
- /* 检查Weight的唯一性和存在性 */
- %if %SYSFUNC(FIND(&Weight,%STR( ))) NE 0 %then %do;
- %put ERROR: There should be only one Weight, please check it again.;
- %goto exit;
- %end;
- %if &Weight NE %STR() %then %do;
- %ChkVar(SourceTable=&SourceTable,InputVar=&Weight,FlagVarExists=GSFT_FlagVarExists3);
- %if %SYSFUNC(FIND(&GSFT_FlagVarExists3,0)) NE 0 %then %do;
- %put ERROR: The Weight "&Weight" does not exist in SourceTable, please check it again.;
- %goto exit;
- %end;
- %end;
- /* 检查Statistic的唯一性 */
- %if %SYSFUNC(FIND(&Statistic,%STR( ))) NE 0 %then %do;
- %put ERROR: There should be only one Statistic, please check it again.;
- %goto exit;
- %end;
- /* 确认指定的统计量,N|MIN|MAX|MEAN|STD为StatGroup1,其他非特殊统计量为StatGroup2,SVAR为Group3,GMEAN为Group4 */
- %let Statistic=%UPCASE(&Statistic);
- %let GSFT_StatGroup1=%PrxMatch(InputString=&Statistic,PrxString=/((?<!\w)N(?!\w))|(MIN)|(MAX)|((?<!\w)MEAN(?!\w))|(STD)|(ALL)/);
- /* 将正则表达式GSFT_StatGroup2分拆成两段,以免报错 */
- %let GSFT_StatGroup2a=%PrxMatch(InputString=&Statistic,PrxString=/(CSS)|(RANGE)|(CV)|(SKEW)|(KURT)|(LCLM)|(STDERR)|((?<!\w)SUM(?!\w))|(SUMWGT)|(UCLM)|(MODE)|(USS)|((?<!\w)VAR(?!\w))|(NMISS)|(MEDIAN)/);
- %let GSFT_StatGroup2b=%PrxMatch(InputString=&Statistic,PrxString=/(Q1)|(Q3)|(P1(?!0))|(P5)|(P10)|(P25)|(P50)|(P75)|(P90)|(P95)|(P99)|(QRANGE)|(PRT)|((?<!\w)T(?!\w))/);
- %let GSFT_StatGroup3=%PrxMatch(InputString=&Statistic,PrxString=/(SVAR)/);
- %let GSFT_StatGroup4=%PrxMatch(InputString=&Statistic,PrxString=/(GMEAN|GSUM)/);
- %if &GSFT_StatGroup1 EQ 0 AND &GSFT_StatGroup2a EQ 0 AND &GSFT_StatGroup2b EQ 0 AND &GSFT_StatGroup3 EQ 0 AND &GSFT_StatGroup4 EQ 0 %then %do;
- %put ERROR: The Statistic should be assigned properly, please check it again.;
- %goto exit;
- %end;
- /* 开始进行计算 */
- /* 第一步:得到统计量 */
- /* 情形一:计算第一组统计量 */
- %if &GSFT_StatGroup1 NE 0 %then %do;
- %if &ByFactors NE %STR() %then %do;
- proc sort data=&SourceTable;
- by &ByFactors;
- run;
- proc means data=&SourceTable noprint;
- by &ByFactors;
- var &InputVar;
- weight &Weight;
- output out=GSFT_temp;
- run;
- %end;
- %else %do;
- proc means data=&SourceTable noprint;
- var &InputVar;
- weight &Weight;
- output out=GSFT_temp;
- run;
- %end;
- data &TargetTable;
- set GSFT_temp;
- %if &Statistic NE ALL %then %do;
- if _stat_="&Statistic." then output &TargetTable;
- %end;
- run;
- %end;
- /* 情形二:计算第二组统计量 */
- %else %if &GSFT_StatGroup2a NE 0 OR &GSFT_StatGroup2b NE 0 %then %do;
- %if &ByFactors NE %STR() %then %do;
- proc sort data=&SourceTable;
- by &ByFactors;
- run;
- proc means data=&SourceTable noprint;
- by &ByFactors;
- var &InputVar;
- weight &Weight;
- output out=GSFT_temp &Statistic.=;
- run;
- data &TargetTable;
- retain &ByFactors _TYPE_ _FREQ_ _STAT_ &InputVar;
- set GSFT_temp;
- _STAT_="&Statistic.";
- run;
- %end;
- %else %do;
- proc means data=&SourceTable noprint;
- var &InputVar;
- weight &Weight;
- output out=GSFT_temp &Statistic.=;
- run;
- data &TargetTable;
- retain &ByFactors _TYPE_ _FREQ_ _STAT_ &InputVar;
- set GSFT_temp;
- _STAT_="&Statistic.";
- run;
- %end;
- %end;
- /* 情形三:计算第三组统计量,SVAR,即半方差 */
- %else %if &GSFT_StatGroup3 NE 0 %then %do;
- /* 逗号分隔的ByFactors_Comma用于SQL语句 */
- %if %SYSFUNC(FIND(&ByFactors,%STR( ))) NE 0 %then %do;
- %let ByFactors_Comma=%SYSFUNC(TRANWRD(%SYSFUNC(COMPBL(&ByFactors)),%STR( ),%STR(,)));
- %end;
- %else %let ByFactors_Comma=&ByFactors;
- /* 第一步:计算分组均值 */
- proc sql noprint;
- create table GSFT_Mean as
- select *,Count(*) as _FREQ_,
- %do GSFT_i=1 %to &GSFT_InputVar_Num;
- (mean(&&GSFT_InputVar_Var&GSFT_i.)) as &&GSFT_InputVar_Var&GSFT_i.._Mean
- %if &GSFT_i NE &GSFT_InputVar_Num %then %do;
- ,
- %end;
- %end;
- from &SourceTable
- %if &ByFactors NE %STR() %then %do;
- group by &ByFactors_Comma
- %end;
- ;
- quit;
- /* 第二步:计算分组半离差平方 */
- data GSFT_SqrDev;
- set GSFT_Mean;
- %do GSFT_j=1 %to &GSFT_InputVar_Num;
- if &&GSFT_InputVar_Var&GSFT_j LT &&GSFT_InputVar_Var&GSFT_j.._Mean then &&GSFT_InputVar_Var&GSFT_j.._SqrDev=(&&GSFT_InputVar_Var&GSFT_j..-&&GSFT_InputVar_Var&GSFT_j.._Mean)**2;
- else &&GSFT_InputVar_Var&GSFT_j.._SqrDev=0;
- %end;
- run;
- /* 第三步:计算分组半方差 */
- proc sql noprint;
- create table &TargetTable as
- select distinct
- %if &ByFactors NE %STR() %then %do;
- &ByFactors_Comma,
- %end;
- _FREQ_,
- %do GSFT_k=1 %to &GSFT_InputVar_Num;
- sum(&&GSFT_InputVar_Var&GSFT_k.._SqrDev)/_FREQ_ as &&GSFT_InputVar_Var&GSFT_k
- %if &GSFT_k NE &GSFT_InputVar_Num %then %do;
- ,
- %end;
- %end;
- from GSFT_SqrDev
- %if &ByFactors NE %STR() %then %do;
- group by &ByFactors_Comma
- %end;
- ;
- quit;
- data &TargetTable;
- retain &ByFactors _TYPE_ _FREQ_ _STAT_ &InputVar;
- set &TargetTable;
- _TYPE_=0;
- _STAT_='SVAR';
- run;
- %end;
- /* 情形四:计算第四组统计量,GMEAN(几何平均值)和GSUM(连乘积) */
- %else %if &GSFT_StatGroup4 NE 0 %then %do;
- %if %UPCASE(%SUBSTR(&InputVarType,1,1)) EQ P %then %do; /* 仅对InputVarType的首字母进行判断,P可代表Price/PNL */
- proc sql noprint;
- create table &TargetTable as
- select distinct
- %if &ByFactors NE %STR() %then %do;
- &ByFactors_Comma,
- %end;
- %if %UPCASE(&Statistic) EQ GMEAN %then %do;
- %do GSFT_i=1 %to &GSFT_InputVar_Num;
- (exp(sum(log(&&GSFT_InputVar_Var&GSFT_i))))**(1/COUNT(*)) as &&GSFT_InputVar_Var&GSFT_i
- %if &GSFT_i NE &GSFT_InputVar_Num %then %do;
- ,
- %end;
- %end;
- %end;
- %else %if %UPCASE(&Statistic) EQ GSUM %then %do;
- %do GSFT_j=1 %to &GSFT_InputVar_Num;
- (exp(sum(log(&&GSFT_InputVar_Var&GSFT_j)))) as &&GSFT_InputVar_Var&GSFT_j
- %if &GSFT_j NE &GSFT_InputVar_Num %then %do;
- ,
- %end;
- %end;
- %end;
- from &SourceTable
- %if &ByFactors NE %STR() %then %do;
- group by &ByFactors_Comma
- %end;
- ;
- quit;
- %end;
- %else %if %UPCASE(%SUBSTR(&InputVarType,1,1)) EQ R %then %do;
- proc sql noprint;
- create table &TargetTable as
- select distinct
- %if &ByFactors NE %STR() %then %do;
- &ByFactors_Comma,
- %end;
- %if %UPCASE(&Statistic) EQ GMEAN %then %do;
- %do GSFT_i=1 %to &GSFT_InputVar_Num;
- (exp(sum(log(1+&&GSFT_InputVar_Var&GSFT_i))))**(1/COUNT(*))-1 as &&GSFT_InputVar_Var&GSFT_i
- %if &GSFT_i NE &GSFT_InputVar_Num %then %do;
- ,
- %end;
- %end;
- %end;
- %else %if %UPCASE(&Statistic) EQ GSUM %then %do;
- %do GSFT_j=1 %to &GSFT_InputVar_Num;
- exp(sum(log(1+&&GSFT_InputVar_Var&GSFT_j)))-1 as &&GSFT_InputVar_Var&GSFT_j
- %if &GSFT_j NE &GSFT_InputVar_Num %then %do;
- ,
- %end;
- %end;
- %end;
- from &SourceTable
- %if &ByFactors NE %STR() %then %do;
- group by &ByFactors_Comma
- %end;
- ;
- quit;
- %end;
- %else %do;
- %put ERROR: The parameter InputVarType should be P or R, case insensitive and without quotes.;
- %goto exit;
- %end;
- data &TargetTable;
- retain &ByFactors _TYPE_ _FREQ_ _STAT_ &InputVar;
- set &TargetTable;
- _TYPE_=0;
- %if %UPCASE(&Statistic) EQ GMEAN %then %do;
- _STAT_='GMEAN';
- %end;
- %if %UPCASE(&Statistic) EQ GSUM %then %do;
- _STAT_='GSUM';
- %end;
- run;
- %end;
- /* 第二步:按要求输出表格 */
- %if %UPCASE(&OutputVarType) EQ SUFFIX %then %do;
- proc sql noprint;
- select distinct _STAT_ into :GSFT_StatString separated by ' '
- from &TargetTable;
- quit;
- %SeparateString(InputString=&GSFT_StatString,OutputString=GSFT_StatString);
- data %do GSFT_m=1 %to &GSFT_StatString_Num;
- GSFT_Output_&&GSFT_StatString_Var&GSFT_m(rename=(
- %do GSFT_n=1 %to &GSFT_InputVar_Num;
- &&GSFT_InputVar_Var&GSFT_n..=&&GSFT_InputVar_Var&GSFT_n.._&&GSFT_StatString_Var&GSFT_m
- %end;
- ))
- %end;
- ;
- set &TargetTable;
- %do GSFT_o=1 %to &GSFT_StatString_Num;
- if _STAT_ EQ "&&GSFT_StatString_Var&GSFT_o" then output GSFT_Output_&&GSFT_StatString_Var&GSFT_o;
- %end;
- run;
- data &TargetTable(drop=_STAT_);
- merge %do GSFT_p=1 %to &GSFT_StatString_Num;
- GSFT_Output_&&GSFT_StatString_Var&GSFT_p
- %end;
- ;
- by &ByFactors;
- run;
- %end;
- /* 删除不必要的表格 */
- proc datasets lib=work nolist;
- delete GSFT_:;
- quit;
- %exit:
- %mend;
- %macro Demo();
- %let SourceTable=NavOfFund;
- %let TargetTable=NavOfFund_GMean;
- %let ByFactors=Fund_Code; /* 可设置为多个,用空格分隔 */
- %let InputVar=Adj_NAV Unit_NAV;
- %let InputVarType=; /* 当Statistic=GMEAN时必须设置,=P表示价格变量,=R表示收益率变量 */
- %let OutputVarType=Suffix; /* 输出变量的类型,=Origin表示不作处理,=Suffix表示输出变量用不同的统计量作为后缀 */
- %let Weight=; /* 仅为一个变量 */
- %let Statistic=All; /* 仅为一个统计量 */
- %GetStatsForTable(&SourceTable,&TargetTable,&ByFactors,&InputVar,&InputVarType,&OutputVarType,&Weight,&Statistic);
- /* 计算几何平均收益率的例子,且InputVar变量为价格变量 */
- %let SourceTable=ReturnOfFund_Dy;
- %let TargetTable=ReturnOfFund_GMean;
- %let ByFactors=Fund_Code; /* 可设置为多个,用空格分隔 */
- %let InputVar=Adj_NAV;
- %let InputVarType=P; /* 当Statistic=GMEAN时必须设置,=P表示价格变量,=R表示收益率变量 */
- %let OutputVarType=; /* 输出变量的类型,=Origin表示不作处理,=Suffix表示输出变量用不同的统计量作为后缀 */
- %let Weight=; /* 仅为一个变量 */
- %let Statistic=GMEAN; /* 仅为一个统计量 */
- %GetStatsForTable(&SourceTable,&TargetTable,&ByFactors,&InputVar,&InputVarType,&OutputVarType,&Weight,&Statistic);
- /* 计算几何连乘积的例子,且InputVar变量为收益率变量 */
- %let SourceTable=GARFP_AbnRetOfPort;
- %let TargetTable=GARFP_AbnRetOfPort_GSUM;
- %let ByFactors=Port_Code End_Yr End_Mt; /* 可设置为多个,用空格分隔 */
- %let InputVar=Ret_Dy Ret_Bm AbnRet_Dy AbnRetRatio_Dy;
- %let InputVarType=R; /* 当Statistic=GMEAN时必须设置,=P表示价格变量,=R表示收益率变量 */
- %let OutputVarType=; /* 输出变量的类型,=Origin表示不作处理,=Suffix表示输出变量用不同的统计量作为后缀 */
- %let Weight=; /* 仅为一个变量 */
- %let Statistic=GSUM; /* 仅为一个统计量 */
- %GetStatsForTable(&SourceTable,&TargetTable,&ByFactors,&InputVar,&InputVarType,&OutputVarType,&Weight,&Statistic);
- %mend;