/* Build dataset A: one observation per individual.
   id      - individual identifier (character)
   rs1-rs4 - two-letter genotype at each of four markers (character)
   All five variables are read with list input and the default
   character length. */
data a;
    input id $ rs1 $ rs2 $ rs3 $ rs4 $;
    datalines;
ind1 TT GG GG TT
ind2 TT GG GG TT
ind3 TT GG GG TT
ind4 AA GG GG TT
ind5 AT GG GG TT
ind6 TT GG GG TT
ind7 AT GG GG TT
ind8 AT GG GG TT
ind9 AT GG GG TT
ind10 AT GG GG TT
ind11 AA CC GG TT
ind12 TT CG GG TT
ind13 AA GG GA TT
ind14 AT GG GG TT
ind15 TT GC GG CC
ind16 AT CC AA CT
ind17 AT GG AG TC
ind18 AT CG AG TT
ind19 AT CG GG TT
ind20 AA GC GG TT
ind21 TT CC AG TT
ind22 AA CG AA TT
ind23 AT GG GA TC
ind24 TT GG GG TT
ind25 AT GC GG TT
ind26 AT CC AA CC
ind27 AT GG AG CT
ind28 AT CG AG TC
ind29 AT CG GG TT
ind30 AT GC GG TT
;
run;
/* Build dataset B: for every individual, isolate each candidate letter
   of each marker and count how many times it occurs in the genotype.

   For marker rsN and letter X:
     xN  - the genotype with every character other than X removed
           (COMPRESS with the 'k' modifier keeps the listed characters)
     lNx - number of X letters in the genotype (0 when X is absent)

   LENGTHN is used instead of LENGTH because LENGTH returns 1 for a
   blank string, whereas LENGTHN returns 0, so no follow-up correction
   of blank results is needed. */
data b;
    set a;

    /* rs1: letters A and T */
    a1  = compress(rs1, 'A', 'k');
    l1a = lengthn(a1);
    t1  = compress(rs1, 'T', 'k');
    l1t = lengthn(t1);

    /* rs2: letters C and G */
    c2  = compress(rs2, 'C', 'k');
    l2c = lengthn(c2);
    g2  = compress(rs2, 'G', 'k');
    l2g = lengthn(g2);

    /* rs3: letters A and G */
    a3  = compress(rs3, 'A', 'k');
    l3a = lengthn(a3);
    g3  = compress(rs3, 'G', 'k');
    l3g = lengthn(g3);

    /* rs4: letters C and T */
    c4  = compress(rs4, 'C', 'k');
    l4c = lengthn(c4);
    t4  = compress(rs4, 'T', 'k');
    l4t = lengthn(t4);
run;
/* Build table BB: a single row holding, for each marker, the total
   number of occurrences of each letter summed over all rows of B.
   Comparing the two totals of a marker shows which letter is rarer. */
proc sql;
    create table bb as
    select
        sum(l1a) as l1a,
        sum(l1t) as l1t,
        sum(l2c) as l2c,
        sum(l2g) as l2g,
        sum(l3a) as l3a,
        sum(l3g) as l3g,
        sum(l4c) as l4c,
        sum(l4t) as l4t
    from b;
quit;
/* Table bb shows that the minor (less frequent) letters of rs1, rs2, rs3 and rs4 are A, C, A and C respectively. */
/* Build dataset RESULT: score each genotype by how many copies of the
   chosen letter it contains, using the letter-only strings from B.
     col1 - copies of A in rs1 (from a1)
     col2 - copies of C in rs2 (from c2)
     col3 - copies of A in rs3 (from a3)
     col4 - copies of C in rs4 (from c4)
   Two copies score 2, one copy scores 1, none scores 0. Any other
   value leaves the score missing. The scored letters are hard-coded
   here, so they must be revisited if the input data change. */
data result;
    set b;

    select (a1);
        when ('AA') col1 = 2;
        when ('A')  col1 = 1;
        when ('')   col1 = 0;
        otherwise;
    end;

    select (c2);
        when ('CC') col2 = 2;
        when ('C')  col2 = 1;
        when ('')   col2 = 0;
        otherwise;
    end;

    select (a3);
        when ('AA') col3 = 2;
        when ('A')  col3 = 1;
        when ('')   col3 = 0;
        otherwise;
    end;

    select (c4);
        when ('CC') col4 = 2;
        when ('C')  col4 = 1;
        when ('')   col4 = 0;
        otherwise;
    end;

    keep id rs1-rs4 a1 c2 a3 c4 col1-col4;
run;
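/* id is the individual's identifier; rs1, rs2, rs3 and rs4 are the variables. Each variable is a
combination of two letters, and those two letters are always drawn from A, T, C and G.
For example, the first column is a combination of A and T. The goal is to find, for each column,
which letter is the minor (less frequent) one, and then score each value in that column:
a combination of two minor letters = 2, one minor letter = 1, no minor letter = 0.
For example, in the first column A occurs less often than T, so AA=2, AT=1, TA=1 and TT=0.
The other three columns are handled the same way. */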
|