选定特定样本组号-剔除组号内样本-按照x排序取临近k位x对应y的均值做预测值-计算CVErrk然后选定optimal k的思路做的,命令如下。
问题是算出来的结果非常非常的奇怪,随着k的波动(横轴)CVErr完全不是预期的情况,应该是代码有错?实在是看不出来问题,想请大神诊断一下!十分感谢
- di "Question2:K-nearest-neighbors regression with k=100"
- gen predict_y_k100=.
- sort x
- drop x_sort
- gen x_sort=_n
- forvalue i=1/100 {
- preserve
- keep if id==`i'
- local t=x_sort
- restore
- preserve
- keep if x_sort<=`t'+100&x_sort>=`t'-100 //k=100情况下,仅保留相近距离为1的三项
- egen m_y= mean(y) //计算此时y的样本均值
- local tt=m_y //提取预测结果
- restore
- replace predict_y=`tt' if id==`i' //将所得的预测值写入对应预测结果栏
- }
- twoway(scatter y x)(line predict_y x),legend(order(1 "观测值y" 2 "拟合值y,k=100"))
- save question2.gph,replace
- di"Question3:10-fold cross validation"
-
- //randomly dividing 10 fold by generating random number
- gen ttt=runiform(0,1)*10
- sort ttt
- gen n=_n
- gen rc=. //分组变量rc
- forvalue i=0(1)9 {
- replace rc=`i'+1 if n>=`i'*10&n<=(`i'+1)*10&n!=(`i'+1)*10
- } //基于均匀分布随机数的随机分组
- drop ttt n
-
- //estiamte the 10-fold cross knn
- gen pred_y_cross=.
- gen k=.
- forvalue k=1/100 {
- forvalue i=1/100{
- preserve
- keep if id==`i'
- local t=rc
- restore //提取样本点所在分组
- preserve
- drop if rc==`t' //剔除样本点所在分组数据后,提取X序列位置
- sort x
- gen ord_x=_n
- keep if id==`i'
- local tt=ord_x
- restore
- preserve
- drop if rc==`t' //剔除样本点所在分组数据后,提取X序列位置
- sort x
- gen ord_x=_n
- keep if ord_x<=`tt'+`k'&ord_x>=`tt'-`k'
- egen m_y=mean(y)
- local ttt_`i'=m_y //记录下给定k距离下序号i样本的预测值
- restore
- replace pred_y_cross=`ttt_`i'' if id==`i'
- }
- forvalue i=1/10 {
- preserve
- keep if rc==`i'
- gen d2_error_y=(y-pred_y_cross)^2
- egen cve_y=mean(d2_error_y)
- local cve_y`i'=cve_y
- restore
- }
- dis
- replace k=(`cve_y1'+`cve_y2'+`cve_y3'+`cve_y4'+`cve_y5'+`cve_y6'+`cve_y7'+`cve_y8'+`cve_y9'+`cve_y10')/10 if id==`k'
- }