def get_structure(notes, mRNA_start, mRNA_length=None):
    """Turn a list of boundary positions into consecutive mRNA intervals.

    Each entry of ``notes`` advances ``mRNA_start`` by three times its
    distance from the previous entry (the first entry is measured from 1),
    which presumably maps amino-acid positions onto nucleotide positions
    (3 nt per codon) -- confirm against the caller's coordinate convention.

    Args:
        notes: Iterable of increasing integer positions; one boundary is
            produced per entry.
        mRNA_start: mRNA coordinate that corresponds to position 1 of
            ``notes``.
        mRNA_length: Coordinate used as the end of the final interval.
            When ``None`` (the default), the value is read from the
            module-level ``df`` as ``df['length_mRNA'].sum()``, which is
            what this function always did before the parameter existed.

    Returns:
        A list of ``[start, end]`` pairs. The first pair starts at 0, each
        pair starts where the previous one ended, and the last pair ends at
        ``mRNA_length``. ``len(notes) + 1`` pairs are returned.
    """
    aa_start = 1
    mRNA_notes = [0]
    for note in notes:
        mRNA_start += (note - aa_start) * 3
        aa_start = note
        mRNA_notes.append(mRNA_start)

    if mRNA_length is None:
        # Backward-compatible fallback: rely on the global frame `df`.
        mRNA_length = df['length_mRNA'].sum()
    mRNA_notes.append(mRNA_length)

    # Pair every boundary with its successor to form the intervals.
    return [[start, end] for start, end in zip(mRNA_notes, mRNA_notes[1:])]
def get_init_term_codon(df_structure, nm_info, json_path='../SiCreen/ncbi_hg38_mRNA.json'):
    """Extract the start codon, stop codon and last-interval length of an mRNA.

    The sequence is looked up by ``nm_info`` in a JSON file that maps record
    names to sequence strings. Rows of ``df_structure`` are addressed by
    position, so the frame does not need a default integer index.

    Args:
        df_structure: DataFrame with integer ``'start'`` and ``'end'``
            columns and at least two rows. The first row's ``end`` is used
            as the 1-based position of the first base of the initiation
            codon; the second-to-last row's ``end`` is used as the 1-based
            position immediately after the termination codon.
        nm_info: Key of the wanted sequence in the JSON file.
        json_path: Path of the JSON file to read. Defaults to the location
            that was previously hard-coded.

    Returns:
        Tuple ``(initiation_codon, termination_codon, len_3utr)`` where the
        codons are 3-character slices of the sequence and ``len_3utr`` is
        ``end - start + 1`` of the last row.

    Raises:
        KeyError: If ``nm_info`` is not a key of the JSON file.
        IndexError: If ``df_structure`` has fewer than two rows.
    """
    import json

    with open(json_path, encoding='utf-8') as f:
        homo_json = json.load(f)
    seq = homo_json[nm_info]

    starts = df_structure['start']
    ends = df_structure['end']

    # Convert the 1-based boundary positions to 0-based slice offsets.
    init_offset = ends.iloc[0] - 1
    term_offset = ends.iloc[-2] - 1

    initiation_codon = seq[init_offset:init_offset + 3]
    termination_codon = seq[term_offset - 3:term_offset]
    len_3utr = ends.iloc[-1] - starts.iloc[-1] + 1
    return initiation_codon, termination_codon, len_3utr
# Boundary positions and the mRNA coordinate of position 1, as consumed by
# get_structure (presumably amino-acid positions of the PCSK9 domains --
# verify against the annotation source).
notes = [1, 31, 153, 453, 694]
mRNA_start = 291
structure = get_structure(notes, mRNA_start)

# One label per interval returned by get_structure (len(notes) + 1 entries).
features = ["5'UTR", 'SP', 'Pro', 'Catalytic domain', 'C-terminal', "3'UTR"]

# Build the feature table in one pass: intervals first, then the label and
# strand columns, finally reordered into the output column layout.
df_structure = (
    pd.DataFrame(structure, columns=['start', 'end'])
    .assign(feature=features, strand='+')
    .loc[:, ['feature', 'strand', 'start', 'end']]
)

# Key of the transcript record inside the sequence JSON file.
pcsk9_info = 'NM_174936.4 Homo sapiens proprotein convertase subtilisin/kexin type 9 (PCSK9), transcript variant 1, mRNA'
initiation_codon, termination_codon, len_3utr = get_init_term_codon(
    df_structure, pcsk9_info
)

# Persist the table as tab-separated text without the row index.
df_structure.to_csv(
    '../SiCreen/PCSK9_structure_mRNA_features.txt',
    sep='\t',
    index=None,
)

print(f'initiation_codon: {initiation_codon}')
print(f'termination_codon: {termination_codon}')
print(f'len_3utr: {len_3utr}')

# Bare expression: shows the table when run as the last line of a notebook cell.
df_structure


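# [Web-page footer residue, not part of the script] "Radar Card"



# [Web-page footer residue, not part of the script] Beijing Public Network Security Registration No. 11010802022788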







