if either column A or B has a big deviation, then grouping the observations by their distance to the origin before selecting will have a clear advantage over Cartesian product. For example,
- data hell;
- do n=1 to 1000000;
- a = 1000*rannor(12345);
- b = 10*rannor(12345);
- c = rannor(12345);
- zone = floor(sqrt(a*a+b*b)/.6);*sqrt(0.5^2+0.3^2)<0.6;
- output;
- end;
-
- proc freq data=hell;
- tables zone/nocum nopct;
- run;
- proc sql;
- *create table pairs as
- select A.n as n_a, B.n as n_b, A.a as a_a, B.a as a_b, A.b as b_a, B.b as b_b, A.c as c_a, B.c as c_b
- from hell a, hell b
- where abs(A.a-B.a)<=0.5 and abs(A.b-B.b)<=0.3 and a.n < b.n;
- quit;
- proc sql;
- create table pairs1 as
- select A.n as n_a, B.n as n_b, A.a as a_a, B.a as a_b, A.b as b_a, B.b as b_b, A.c as c_a, B.c as c_b
- from hell A join hell B on A.zone=B.zone
- where abs(A.a-B.a)<=0.5 and abs(A.b-B.b)<=0.3 and a.n < b.n
- outer union corr
- select A.n as n_a, B.n as n_b, A.a as a_a, B.a as a_b, A.b as b_a, B.b as b_b, A.c as c_a, B.c as c_b
- from hell A join hell B on A.zone+1=B.zone
- where abs(A.a-B.a)<=0.5 and abs(A.b-B.b)<=0.3;
- quit;
复制代码However if you data looks like the following, then I have to say that may the Lord bless you!
- data hell;
- do n=1 to 1000000;
- a = rannor(12345);
- b = ranuni(12345);
- c = rannor(12345);
- zone = floor(sqrt(a*a+b*b)/.6);*sqrt(0.5^2+0.3^2)<0.6;
- output;
- end;
复制代码