现在有两个数据集:训练集(train set)和测试集(test set)。
利用proc cluster过程对训练集数据(train set)聚类之后,如何判断测试集数据(test set)中的样本属于哪一类?用SAS怎么实现?
- data train; /* training set: the observations that PROC CLUSTER is run on */
- input id x y; /* list input: observation id followed by two numeric coordinates x and y */
- datalines;
- 1 15.4091 1.5491
- 2 10.9082 -0.9675
- 3 15.3822 1.8183
- 4 9.5898 -2.4073
- 5 9.9458 0.6724
- 6 10.2291 -0.5715
- 7 9.6450 -4.0562
- 8 11.1216 3.2309
- 9 2.0575 3.4359
- 10 11.9938 3.7972
- 11 9.4967 -4.1008
- 12 4.9667 0.6011
- 13 11.2794 -0.8699
- 14 9.6067 -0.7234
- 15 13.0185 0.3925
- 16 4.8990 -1.6354
- 17 11.1679 0.8643
- 18 11.6815 2.3479
- 19 8.4027 -3.3714
- 20 7.7793 -8.8127
- 21 14.3193 5.5325
- 22 2.1550 3.5026
- 23 8.6603 -2.6741
- 24 15.1578 0.5084
- 25 8.8174 -2.9932
- 26 12.9201 -2.7144
- 27 4.6989 -6.6903
- 28 11.9450 -4.5623
- 29 12.9990 -7.0821
- 30 17.3872 -2.0966
- 31 15.8118 -2.2684
- 32 13.7049 -3.4502
- 33 14.4304 -3.6925
- 34 5.7314 -7.9326
- 35 7.3310 -9.1295
- 36 6.4758 -8.3618
- 37 -0.7505 1.9121
- 38 6.1170 -8.2503
- 39 1.9303 -4.5581
- 40 13.8314 -6.2995
- 41 0.8887 -0.6846
- 42 1.0783 -2.7695
- 43 4.9339 -9.5764
- 44 5.8186 -8.1148
- 45 -0.3435 2.8181
- 46 12.4756 -1.5845
- 47 0.7985 4.9212
- 48 14.5799 -2.1837
- 49 -2.4897 -3.4639
- 50 12.1440 -12.9824
- ;
- data test; /* test set: observations (ids 51-60) to be assigned to clusters found on train */
- input id x y; /* same layout as train: id, then coordinates x and y */
- datalines;
- 51 1.0054 -8.2956
- 52 -0.3614 -1.2892
- 53 -1.3917 3.6621
- 54 0.1904 5.2448
- 55 3.9498 -0.1230
- 56 15.3575 -2.7960
- 57 -0.3440 5.5702
- 58 0.8296 -4.2619
- 59 13.3376 -6.0623
- 60 6.0856 -11.3700
- ;
- run;
- proc cluster data=train method=density R=0.5 outtree=outtree ccc pseudo ;/* Clusters the training set only; the option values here are arbitrary and can be changed freely. Per the SAS/STAT docs: METHOD=DENSITY with R= requests density linkage with a fixed-radius uniform kernel, OUTTREE= saves the cluster tree, CCC and PSEUDO request cluster-count statistics. */
- var x y; /* cluster on the two coordinate variables; id is not used as an analysis variable */
- run;
复制代码