/* Step 1: split the pipe-delimited raw record held in VAR1 (read from yu.d1)
   into typed columns and write the result to yu.yu916. */
libname yu 'd:\yu\';
data yu.yu916;
    set yu.d1;
    length phone_type content_type $50. ggsnip sgsnip ip_destin ip_public ip_terminal $15. ua url_detail url_feature $80.;

    /* Record filter: keep a row only when it has at least 29 '|' delimiters
       and its last field is 19 characters long with exactly two ':' and two
       '-' (presumably a 'yyyy-mm-dd hh:mm:ss' timestamp - confirm). */
    count1=count(var1,'|');
    count2=count(scan(var1,-1,"|","m"),":");
    count3=count(scan(var1,-1,"|","m"),"-");
    length30=length(scan(var1,-1,"|","m"));
    if count1>=29 and count2=2 and count3=2 and length30=19;
    drop count1 count2 count3 length30;

    /* Fields addressed from the left. The "m" modifier makes SCAN treat
       consecutive delimiters as separating empty fields, so positions stay
       stable when a field is blank. */
    cell_num1=scan(var1,1,"|","m");
    visit_over_time1=scan(var1,2,"|","m");
    ip_terminal=scan(var1,3,"|","m");
    ip_public=scan(var1,4,"|","m");
    ip_destin=scan(var1,6,"|","m");
    ua=scan(var1,8,"|","m");
    longtitude1=scan(var1,10,"|","m");
    latitude1=scan(var1,11,"|","m");
    lac1=scan(var1,12,"|","m");
    ci1=scan(var1,13,"|","m");
    phone_type=scan(var1,14,"|","m");
    rat_type1=scan(var1,15,"|","m");
    imsi1=scan(var1,17,"|","m");
    stream_type1=scan(var1,18,"|","m");
    up_stream1=scan(var1,19,"|","m");
    down_stream1=scan(var1,20,"|","m");
    all_stream1=scan(var1,21,"|","m");
    url_feature=scan(var1,22,"|","m");

    /* Fields addressed from the right (negative positions count from the end). */
    visit_begin_time1=scan(var1,-1,"|","m");
    content_type=scan(var1,-2,"|","m");
    status1=scan(var1,-3,"|","m");
    ggsnip=scan(var1,-4,"|","m");
    sgsnip=scan(var1,-5,"|","m");
    time_length1=scan(var1,-6,"|","m");
    time_lag1=scan(var1,-7,"|","m");

    /* Derive url_detail from field 23:
         a = position of '(A' in url, b = position of 'http://' in url.
         - '(A' found after column 1 and no 'http://': text before '(A'.
         - url starts with 'http://' and has no '(A': the second '/'-delimited
           token returned by SCAN.
         - neither marker present: the record is deleted.
       NOTE(review): every other combination (for example a=1, or both markers
       present) keeps the record with a blank url_detail - confirm intended. */
    url=scan(var1,23,"|","m");
    a=index(url,'(A');
    b=index(url,'http://');
    if a>1 and b=0 then url_detail=trim(substr(url,1,a-1));
    else if a=0 and b=1 then url_detail=scan(url,2,"/");
    else if a=0 and b=0 then delete;
    drop var1 url a b;

    /* Character-to-numeric conversion. */
    cell_num=input(cell_num1,comma32.);
    visit_over_time=input(visit_over_time1,anydtdtm40.);
    /* No decimal spec on the coordinate informats: with a w.d informat, input
       that has no decimal point is divided by 10**d, and a narrow width would
       cut off longer coordinate strings. */
    longtitude=input(longtitude1,best32.);
    latitude=input(latitude1,best32.);
    lac=input(lac1,best5.);
    ci=input(ci1,best9.);
    rat_type=input(rat_type1,best1.);
    imsi=input(imsi1,best15.);
    stream_type=input(stream_type1,best3.);
    up_stream=input(up_stream1,best32.);
    down_stream=input(down_stream1,best32.);
    all_stream=input(all_stream1,best32.);
    visit_begin_time=input(visit_begin_time1,anydtdtm40.);
    status=input(status1,best3.);
    time_length=input(time_length1,best30.);
    time_lag=input(time_lag1,best30.);
    format visit_begin_time datetime19.;
    format visit_over_time datetime19.;
    drop cell_num1 visit_over_time1 longtitude1 latitude1 lac1 ci1 rat_type1 imsi1 stream_type1 up_stream1 down_stream1 all_stream1
         visit_begin_time1 status1 time_length1 time_lag1;

    /* Discard rows lacking a coordinate or a cell_num. MISSING() tests the
       numeric values directly instead of comparing them with ''. */
    if missing(longtitude) or missing(latitude) or missing(cell_num) then delete;

    /* These parsed columns are not kept in the output data set. */
    drop ggsnip sgsnip ip_destin ip_public ip_terminal url_feature;

    /* When the value stored as longitude is below 90, exchange the two
       coordinates (presumably the source columns were swapped - confirm). */
    if longtitude<90 then do;
        ab=longtitude;
        longtitude=latitude;
        latitude=ab;
    end;
    drop ab;
run;
/* Order yu.yu916 in place by cell_num, then visit_over_time.
   TAGSORT and SORTSIZE=200m are passed through as sort options. */
libname yu 'd:\yu\';
proc sort data=yu.yu916 tagsort sortsize=200m;
    by cell_num visit_over_time;
run;
quit;
/* Build yu.yu916_2: one row per (cell_num, visit_over_time), with the time
   gap and GEODIST distance to the previous kept row and a running sequence
   number within each cell_num. */
libname yu 'd:\yu\';
data yu.yu916_2;
    set yu.yu916;
    by cell_num visit_over_time;

    /* Keep only the first row of every (cell_num, visit_over_time) group. */
    if first.visit_over_time;

    /* The LAG calls sit after the subsetting IF and outside any condition, so
       each one returns the value from the previous row that was kept. */
    deltat   = visit_over_time - lag1(visit_over_time);
    distance = geodist(latitude, longtitude, lag1(latitude), lag1(longtitude));

    /* At the start of a cell_num there is no previous row for that cell_num:
       blank the deltas and restart the counter. */
    if first.cell_num then do;
        timeseq  = 0;
        deltat   = .;
        distance = .;
    end;

    /* Sum statement: timeseq is retained and counts 1, 2, 3, ... per cell_num. */
    timeseq + 1;
run;
/* Dataset A: per-cell_num totals (distance, elapsed time, traffic) built from yu.yu916_2 */
/* Uses all records of yu.yu916_2 */
/* yu.ma916: the subset of yu.yu916_2 columns needed for the per-cell_num totals. */
libname yu 'd:\yu\';
data yu.ma916;
    set yu.yu916_2(keep=cell_num distance deltat up_stream down_stream all_stream rat_type phone_type);
run;
/* Order yu.ma916 in place by cell_num so it can be read with BY cell_num. */
libname yu 'd:\yu\';
proc sort data=yu.ma916;
    by cell_num;
run;
quit;
/* yu.a916: collapse yu.ma916 to one row per cell_num.
   Output columns:
     movement, duration                  - sums of distance and deltat
     stream_total, stream_up, stream_down - sums of all_stream, up_stream, down_stream
     phone_system                        - 1 when the first '-'-delimited token of
                                           phone_type is 'APPLE', else 0
     holiday                             - constant 0
     G3, G4, G2, G0                      - indicator flags derived from rat_type
   Only the last row of each cell_num is written, so phone_system and the G
   flags reflect that last row. */
libname yu 'd:\yu\';
data yu.a916;
    set yu.ma916;
    by cell_num;
    retain movement duration stream_total stream_up stream_down;

    p=scan(phone_type,1,'-');
    if p='APPLE' then phone_system=1;
    else phone_system=0;

    holiday=0;

    /* Exactly one flag is set per recognised rat_type. The rat_type=0 branch
       previously also set G4=1, so a row was flagged as both G4 and G0.
       Any rat_type other than 1, 2, 6 or 0 leaves all four flags missing. */
    if rat_type=1 then do; G3=1; G4=0; G2=0; G0=0; end;
    else if rat_type=2 then do; G3=0; G4=0; G2=1; G0=0; end;
    else if rat_type=6 then do; G3=0; G4=1; G2=0; G0=0; end;
    else if rat_type=0 then do; G3=0; G4=0; G2=0; G0=1; end;

    /* Restart the accumulators at the start of each cell_num. */
    if first.cell_num then do;
        movement=0;
        duration=0;
        stream_total=0;
        stream_up=0;
        stream_down=0;
    end;

    /* Sum statements skip missing addends, so the missing distance/deltat on
       the first row of a cell_num does not blank the totals. */
    movement+distance;
    duration+deltat;
    stream_total+all_stream;
    stream_up+up_stream;
    stream_down+down_stream;

    if last.cell_num then output;
    drop distance deltat all_stream up_stream down_stream p phone_type rat_type;
run;
/* yu.nb916: cell_num and url_detail taken from yu.yu916. */
libname yu 'd:\yu\';
data yu.nb916;
    set yu.yu916(keep=cell_num url_detail);
run;
/* Dataset B: per-cell_num count of records whose url_detail is in a fixed list of qq.com hosts */
/* Order yu.nb916 in place by cell_num so it can be read with BY cell_num. */
libname yu 'd:\yu\';
proc sort data=yu.nb916;
    by cell_num;
run;
quit;
/* yu.ba916: one row per cell_num with
     freq - number of rows whose url_detail is one of the listed hosts
     y1   - 1 when freq is above zero, otherwise 0 */
libname yu 'd:\yu\';
data yu.ba916;
    set yu.nb916;
    by cell_num;

    /* A boolean expression evaluates to 1 (true) or 0 (false). */
    y = (url_detail in ('wspeed.qq.com','c.isdspeed.qq.com','isdspeed.qq.com','mmspeed.qq.com','s.isdspeed.qq.com'));

    retain freq;
    if first.cell_num then freq = 0;
    freq + y;

    /* freq starts at 0 and only grows, so it is either 0 or positive here. */
    y1 = (freq > 0);

    if last.cell_num then output;
    drop y url_detail;
run;
/*merge A and B*/
/* yu.c916: match-merge yu.a916 and yu.ba916 by cell_num, stamp every row with
   the date 16 Sep 2015, and drop stream_up and stream_down. */
libname yu 'd:\yu\';
data yu.c916;
    merge yu.a916 yu.ba916;
    by cell_num;
    informat date mmddyy8.;
    format date mmddyy8.;
    /* Use a date literal: assigning the bare number 09162015 stores 9162015
       days since 01JAN1960, which is not 16 Sep 2015 and cannot be rendered
       by the MMDDYY8. format. */
    date='16SEP2015'd;
    drop stream_up stream_down;
run;
/* Rebuild yu.nb916, this time from yu.yu916_2: cell_num and url_detail only.
   This replaces the yu.nb916 created earlier from yu.yu916. */
libname yu 'd:\yu\';
data yu.nb916;
    set yu.yu916_2(keep=url_detail cell_num);
run;
/* yu.moz916: one row per cell_num with
     freq_moz - number of rows whose url_detail is one of the listed hosts
     moz      - 1 when freq_moz is above zero, otherwise 0 */
libname yu 'd:\yu\';
data yu.moz916;
    set yu.nb916;
    by cell_num;

    /* A boolean expression evaluates to 1 (true) or 0 (false). */
    y = (url_detail in (
        /* momo hosts */
        'img.momocdn.com','ap.immomo.com','imgws.wemomo.com','api.immomo.com',
        'imgdnion.wemomo.com','referee.immomo.com','momo-img.qiniucdn.com',
        'et.momocdn.com','cdnst.immomo.com','cdnst.momocdn.com','game.immomo.com',
        'm.immomo.com','file-api.immomo.com','www.immomo.com','passport.immomo.com',
        'img.immomo.com','game-api.immomo.com','dl.immomo.com','download.immomo.com',
        'ap.wemomo.com','chat.immomo.com',
        /* youyuan.com hosts */
        'hulu.youyuan.com','next.youyuan.com','px1.youyuan.com','mmapp2.youyuan.com',
        'ptw.youyuan.com','touch.youyuan.com','pt.youyuan.com','n.youyuan.com',
        'v.youyuan.com','youyuan.com','huluemt.youyuan.com','bang.iosui.youyuan.com',
        /* changba.com hosts */
        'api.changba.com','aliimg.changba.com','gapi.changba.com',
        'timeline.api.changba.com','vapi.changba.com',
        /* qq.com host */
        'xiaoqu.qq.com',
        /* xianglianai.cn hosts */
        'msg.xianglianai.cn','api.xianglianai.cn','pic0.xianglianai.cn',
        'pic1.xianglianai.cn','a.xianglianai.cn','pic9.xianglianai.cn',
        /* tieba.baidu.com hosts */
        'c.tieba.baidu.com','im.tieba.baidu.com:8000','tieba.baidu.com',
        'im.tieba.baidu.com','static.tieba.baidu.com','tippaj.tieba.baidu.com'
    ));

    retain freq_moz;
    if first.cell_num then freq_moz = 0;
    freq_moz + y;

    /* freq_moz starts at 0 and only grows, so it is either 0 or positive here. */
    moz = (freq_moz > 0);

    if last.cell_num then output;
    drop y url_detail;
run;
/* yu.cm916: match-merge yu.cc916 with the per-cell_num flags in yu.moz916.
   NOTE(review): yu.cc916 is not created anywhere in the visible part of this
   program; the earlier merge step writes yu.c916. Confirm that yu.cc916 is
   built elsewhere, or whether yu.c916 was intended here. */
libname yu 'd:\yu\';
data yu.cm916;
    merge yu.cc916
          yu.moz916;
    by cell_num;
run;