- #!~/miniconda2/envs/py3/bin/python
- import pysam
- import re
- import sys
- import os
- def divide_UMI(s):
- name = s.query_name.split('|')[1]
- UMI1 = re.findall(r'.{12}', name)[0]
- UMI2 = re.findall(r'.{12}', name)[1]
- return(UMI1,UMI2)
-
- r1 = pysam.AlignmentFile(sys.argv[1], 'rb')
- outfile = pysam.AlignmentFile("raw_duplex.bam","wb",template=r1)
- for s1 in r1:
- r2 = pysam.AlignmentFile(sys.argv[2], 'rb')
- for s2 in r2:
- R1_UMI1,R1_UMI2 = divide_UMI(s1)
- R2_UMI1,R2_UMI2 = divide_UMI(s2)
- if(R1_UMI1==R2_UMI2 and R1_UMI2==R2_UMI1 and s1.pos==s2.pos and s1.pnext==s2.pnext):
- outfile.write(s1)
- outfile.write(s2)
- else:
- continue
- r2.close()
- outfile.close()
- r1.close()
E00548:177:HKH53CCXY:4:1204:1783:5563|TACAGACTGTGGCAAGCAACCGAT 163 chr22 24930312 54 71M67S = 24930312 71 GAGAATTGCTTGGGCAGAGGTTGCAGTGAACTGAGATCATGCCACTGCACTCCAGCCTGGCGACAGAGCGAACCACAGTCTGTAAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTAT JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJAJJJJFJJJJAAJJFJFJJJJJAFJJJJJJJJJJFAJAAF NM:i:0 MD:Z:71 AS:i:71 XS:i:54 RG:Z:L004
E00548:177:HKH53CCXY:4:2210:30573:69678|CAAGCAACCGATTACAGACTGTGG 99 chr22 24930312 54 71M67S = 24930312 71 GAGAATTGCTTGGGCAGAGGTTGCAGTGAACTGAGATCATGCCACTGCACTCCAGCCTGGCGACAGAGCGAACCACAGTCTGTAAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTG JJJJJJJFJJJJJJJJJJFJJJJJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJJJJJJJAJJJJJJFJJJFFJJJ7 NM:i:0 MD:Z:71 AS:i:71 XS:i:54 RG:Z:L004
这是两行,分别来自打开的两个文件,格式都是相同的,筛选出这两行的条件就是:
1、这两行的第一列中的TACAGACTGTGGCAAGCAACCGAT 与 CAAGCAACCGATTACAGACTGTGG
第一个的前12个字符等于第二个中的后12个字符,第一个的后12个字符等于第二个中的前12个字符,直观一点就是:
TACAGACTGTGG CAAGCAACCGAT
CAAGCAACCGAT TACAGACTGTGG
2、第四列值相同
3、这两行的第8列值也相同
如上代码所示,我是对这两个文件用了两个for循环,满足条件的就写入文件,但是这个代码运行得十分缓慢,请问一下怎么修改可以提高运行速度啊?