Python在生物学领域的简单应用——处理DNA序列

DNA的反向互补序列

  假设我们有一串DNA序列,存在一个名为“dna.txt”的文本文档中。那么,我们该如何用Python输出它的反向序列、互补序列以及反向互补序列呢?
  在这之前,我们不妨定义一个函数,用来打开并读取txt文件。我们把这个函数命名为read_seq(),这个函数的参数为我们的文件路径。需要注意的一点是,在dna.txt文件中,存在着换行符\n和回车符\r(如下图所示),而我们只需要用到代表碱基序列的大写字母。我们可以用replace()方法来替换掉它们。
Alt
  最终,我们定义的read_seq()函数如下:

1
2
3
4
5
6
def read_seq(inputfile):
    """Read a sequence from a text file and return it as one unbroken string.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        The file contents with every line-feed and carriage-return
        character removed.
    """
    # The context manager closes the file even if reading raises.
    with open(inputfile, "r") as handle:
        seq = handle.read()
    return seq.replace("\n", "").replace("\r", "")

  接下来,我们需要定义dna_complement()、dna_reverse()、dna_revcomp()三个函数。在定义dna_complement()函数的时候,和上面替换换行符和回车符同样的道理,我们可以用replace()方法来替换字符串中代表碱基的大写字母。而定义后两个函数明显容易得多,我们只需要让字符串反向输出就可以了。我们定义的dna_complement()、dna_reverse()、dna_revcomp()三个函数如下所示:

1
2
3
4
5
6
7
def dna_complement(seq):
    """Return the complementary strand of a DNA sequence, in upper case.

    Args:
        seq: DNA sequence; upper- or lower-case letters are accepted.

    Returns:
        The upper-cased sequence with A<->T and C<->G swapped. Any other
        character is left unchanged.
    """
    # One translation pass maps all four bases simultaneously. Chaining
    # replace() calls does not work: A -> T followed by T -> A turns every
    # A and T into A (and likewise every C and G into C).
    return seq.upper().translate(str.maketrans("ATCG", "TAGC"))
1
2
3
def dna_reverse(seq):
    """Return the upper-cased sequence read from its last base to its first."""
    return seq.upper()[::-1]
1
2
3
def dna_revcomp(seq):
    """Return the reverse complement of a DNA sequence.

    The sequence is upper-cased, passed through ``dna_complement`` and the
    complemented strand is then reversed.
    """
    complemented = dna_complement(seq.upper())
    return complemented[::-1]

  完整的代码如下所示:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def dna_complement(seq):
    """Return the complementary strand of a DNA sequence, in upper case.

    Args:
        seq: DNA sequence; upper- or lower-case letters are accepted.

    Returns:
        The upper-cased sequence with A<->T and C<->G swapped. Any other
        character is left unchanged.
    """
    # One translation pass maps all four bases simultaneously. Chaining
    # replace() calls does not work: A -> T followed by T -> A turns every
    # A and T into A (and likewise every C and G into C).
    return seq.upper().translate(str.maketrans("ATCG", "TAGC"))


def dna_reverse(seq):
    """Return the upper-cased sequence read from its last base to its first."""
    return seq.upper()[::-1]


def dna_revcomp(seq):
    """Return the reverse complement of a DNA sequence.

    The sequence is upper-cased, passed through ``dna_complement`` and the
    complemented strand is then reversed.
    """
    complemented = dna_complement(seq.upper())
    return complemented[::-1]


def read_seq(inputfile):
    """Read a sequence from a text file and return it as one unbroken string.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        The file contents with every line-feed and carriage-return
        character removed.
    """
    # The context manager closes the file even if reading raises.
    with open(inputfile, "r") as handle:
        seq = handle.read()
    return seq.replace("\n", "").replace("\r", "")


if __name__ == '__main__':
    dna = read_seq("E:\\python_pycharm\\一些Python程序练习\\DNA\\dna.txt")
    # Print the original sequence first, then its complement, its reverse
    # and its reverse complement, one per line and in that order.
    print(dna)
    for derive in (dna_complement, dna_reverse, dna_revcomp):
        print(derive(dna))

  输出结果如下:

1
2
3
4
GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCAGATCATCTTGCTTTGCCAGTTTGGGGTTGGGACTTTTGCCAATGTATTTCTCTTTGTCTATAATTTCTCTCCAATCTCGACTGGTTCTAAACAGAGGCCCAGACAAGTGATTTTAAGACACATGGCTGTGGCCAATGCCTTAACTCTCTTCCTCACTATATTTCCAAACAACATGATGACTTTTGCTCCAATTATTCCTCAAACTGACCTCAAATGTAAATTAGAATTCTTCACTCGCCTCGTGGCAAGAAGCACAAACTTGTGTTCAACTTGTGTTCTGAGTATCCATCAGTTTGTCACACTTGTTCCTGTTAATTCAGGTAAAGGAATACTCAGAGCAAGTGTCACAAACATGGCAAGTTATTCTTGTTACAGTTGTTGGTTCTTCAGTGTCTTAAATAACATCTACATTCCAATTAAGGTCACTGGTCCACAGTTAACAGACAATAACAATAACTCTAAAAGCAAGTTGTTCTGTTCCACTTCTGATTTCAGTGTAGGCATTGTCTTCTTGAGGTTTGCCCATGATGCCACATTCATGAGCATCATGGTCTGGACCAGTGTCTCCATGGTACTTCTCCTCCATAGACATTGTCAGAGAATGCAGTACATATTCACTCTCAATCAGGACCCCAGGGGCCAAGCAGAGACCACAGCAACCCATACTATCCTGATGCTGGTAGTCACATTTGTTGGCTTTTATCTTCTAAGTCTTATTTGTATCATCTTTTACACCTATTTTATATATTCTCATCATTCCCTGAGGCATTGCAATGACATTTTGGTTTCGGGTTTCCCTACAATTTCTCCTTTACTGTTGACCTTCAGAGACCCTAAGGGTCCTTGTTCTGTGTTCTTCAACTGTTGAAAGCCAGAGTCACTAAAAATGCCAAACACAGAAGACAGCTTTGCTAATACCATTAAATACTTTATTCCATAAATATGTTTTTAAAAGCTTGTATGAACAAGGTATGGTGCTCACTGCTATACTTATAAAAGAGTAAGGTTATAATCACTTGTTGATATGAAAAGATTTCTGGTTGGAATCTGATTGAAACAGTGAGTTATTCACCACCCTCCATTCTCT
CCACACAAAAACCCCACACCAACACAACACACCAAACAACCCACAAAACCACACACCAACACCCAAAACACAACAACAACCAAACCCACAAACCCCAACCCACAAAACCCAAACAAAAACACAAACACAAAAAAAACACACCAAACACCACACCAACAAAACACACCCCCACACAACACAAAAAAACACACAACCCACACCCCAAACCCAAAACACACAACCACACAAAAAAACCAAACAACAACAACACAAAACCACCAAAAAAACCACAAACACACCACAAAACAAAAAAACAAAACAACACACCCCACCACCCAACAACCACAAACAACACAACAACAACACAACACACAAACCAACACAAACACACACAACAACCACAAAAAACACCAAAACCAAAACACACACCAACACACACAAACAACCCAACAAAAACAACAAACACAACAACCAACAACACACACAAAAAAAACAACAACAAACCAAAAAACCACACACCACCACACAAAACACACAAAAACAAAAACACAAAAACCAACAACAACACAACCACAACACAAAACACACAACCCAAACACAACAACACCAAACCCCAACAACCCACAAACAACACCAACAACCACACCACCACACACACCAACCAACAACACCACCAAACACAAACACACACAAACCACAACAAAAACACACACAAACACCACCCCACCCCCCAACCACACACCACACCAACCCAAACAAACCACAACCACCAACACACAAAACAACCCAAAAAACAACAAACACAAAAAACAAACAACAAAAACACCAAAAAAAAAAAAACACAACAAACCCACACCCAAACCAAACACAAAAACCAAACCCCAAACCCAACAAAAACACCAAAACACAACACCAACACACACCCAAACCCACCAACAACACACAACAACAACACAACAAACCCACACACACAAAAAAACCCAAACACACAACACACCAAACCAAAAACCAAAAAAAACAAAAAACCAAAAAAAACAAAAAAAAACCAACAAACAACAACCAAACCACCACACACCAAAACAAAAAAAACACAAACCAAAAAAACACAACAACAAAACAAAACAAAACACCAACCAAACACAAACAAACACACACAAAAACACCACCCACCAAACACA
TCTCTTACCTCCCACCACTTATTGAGTGACAAAGTTAGTCTAAGGTTGGTCTTTAGAAAAGTATAGTTGTTCACTAATATTGGAATGAGAAAATATTCATATCGTCACTCGTGGTATGGAACAAGTATGTTCGAAAATTTTTGTATAAATACCTTATTTCATAAATTACCATAATCGTTTCGACAGAAGACACAAACCGTAAAAATCACTGAGACCGAAAGTTGTCAACTTCTTGTGTCTTGTTCCTGGGAATCCCAGAGACTTCCAGTTGTCATTTCCTCTTTAACATCCCTTTGGGCTTTGGTTTTACAGTAACGTTACGGAGTCCCTTACTACTCTTATATATTTTATCCACATTTTCTACTATGTTTATTCTGAATCTTCTATTTTCGGTTGTTTACACTGATGGTCGTAGTCCTATCATACCCAACGACACCAGAGACGAACCGGGGACCCCAGGACTAACTCTCACTTATACATGACGTAAGAGACTGTTACAGATACCTCCTCTTCATGGTACCTCTGTGACCAGGTCTGGTACTACGAGTACTTACACCGTAGTACCCGTTTGGAGTTCTTCTGTTACGGATGTGACTTTAGTCTTCACCTTGTCTTGTTGAACGAAAATCTCAATAACAATAACAGACAATTGACACCTGGTCACTGGAATTAACCTTACATCTACAATAAATTCTGTGACTTCTTGGTTGTTGACATTGTTCTTATTGAACGGTACAAACACTGTGAACGAGACTCATAAGGAAATGGACTTAATTGTCCTTGTTCACACTGTTTGACTACCTATGAGTCTTGTGTTCAACTTGTGTTCAAACACGAAGAACGGTGCTCCGCTCACTTCTTAAGATTAAATGTAAACTCCAGTCAAACTCCTTATTAACCTCGTTTTCAGTAGTACAACAAACCTTTATATCACTCCTTCTCTCAATTCCGTAACCGGTGTCGGTACACAGAATTTTAGTGAACAGACCCGGAGACAAATCTTGGTCAGCTCTAACCTCTCTTTAATATCTGTTTCTCTTTATGTAACCGTTTTCAGGGTTGGGGTTTGACCGTTTCGTTCTACTAGACTTTTCGGTGAAGGAGTCACCAAAAGTCCCTACATAGCACTCATCTGTACCTCTCCCGAAAAAGACTGG
ACACAAACCACCCACCACAAAAACACACACAAACAAACACAAACCAACCACAAAACAAAACAAAACAACAACACAAAAAAACCAAACACAAAAAAAACAAAACCACACACCACCAAACCAACAACAAACAACCAAAAAAAAACAAAAAAAACCAAAAAACAAAAAAAACCAAAAACCAAACCACACAACACACAAACCCAAAAAAACACACACACCCAAACAACACAACAACAACACACAACAACCACCCAAACCCACACACAACCACAACACAAAACCACAAAAACAACCCAAACCCCAAACCAAAAACACAAACCAAACCCACACCCAAACAACACAAAAAAAAAAAAACCACAAAAACAACAAACAAAAAACACAAACAACAAAAAACCCAACAAAACACACAACCACCAACACCAAACAAACCCAACCACACCACACACCAACCCCCCACCCCACCACAAACACACACAAAAACAACACCAAACACACACAAACACAAACCACCACAACAACCAACCACACACACCACCACACCAACAACCACAACAAACACCCAACAACCCCAAACCACAACAACACAAACCCAACACACAAAACACAACACCAACACAACAACAACCAAAAACACAAAAACAAAAACACACAAAACACACCACCACACACCAAAAAACCAAACAACAACAAAAAAAACACACACAACAACCAACAACACAAACAACAAAAACAACCCAACAAACACACACAACCACACACAAAACCAAAACCACAAAAAACACCAACAACACACACAAACACAACCAAACACACAACACAACAACAACACAACAAACACCAACAACCCACCACCCCACACAACAAAACAAAAAAACAAAACACCACACAAACACCAAAAAAACCACCAAAACACAACAACAACAAACCAAAAAAACACACCAACACACAAAACCCAAACCCCACACCCAACACACAAAAAAACACAACACACCCCCACACAAAACAACCACACCACAAACCACACAAAAAAAACACAAACACAAAAACAAACCCAAAACACCCAACCCCAAACACCCAAACCAACAACAACACAAAACCCACAACCACACACCAAAACACCCAACAAACCACACAACACAACCACACCCCAAAAACACACC

把DNA编码链转录为mRNA

  我们同样通过定义函数的方式来实现DNA编码链的转录。根据生物学相关知识,我们只需要把DNA编码链的T替换成U即可得到mRNA(在此我们忽略DNA的启动子与终止子)。代码实现如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def transcription(seq):
    """Transcribe a DNA sequence into mRNA.

    The given DNA sequence is assumed to be the coding strand, so the mRNA
    is produced by upper-casing it and replacing every T with U.
    """
    return seq.upper().replace('T', 'U')


def read_seq(inputfile):
    """Read a sequence from a text file and return it as one unbroken string.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        The file contents with every line-feed and carriage-return
        character removed.
    """
    # The context manager closes the file even if reading raises.
    with open(inputfile, "r") as handle:
        seq = handle.read()
    return seq.replace("\n", "").replace("\r", "")


if __name__ == '__main__':
    dna_path = "E:\\python_pycharm\\一些Python程序练习\\DNA\\dna.txt"
    dna = read_seq(dna_path)
    print(dna)  # original DNA sequence (coding strand)
    mrna = transcription(dna)
    print(mrna)  # transcribed mRNA sequence

根据DNA编码链序列或mRNA序列翻译出蛋白质序列

  我们先来讨论根据DNA编码链翻译蛋白质。定义一个新函数dna_translate()。在定义函数的过程中,我们需要思考一个问题:众所周知,mRNA翻译蛋白质是从起始密码子开始翻译,到终止密码子停止翻译。如果不考虑这个问题的话很有可能会得到错误的结果。定义函数如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import re


def dna_translate(seq):
    """Translate a DNA coding strand into a one-letter protein sequence.

    Translation starts at the first ``ATG`` in *seq* and reads one codon
    (three bases) at a time until a stop codon (``TAA``, ``TAG`` or
    ``TGA``) is translated or no complete codon is left.

    Args:
        seq: DNA coding-strand sequence; upper- or lower-case letters are
            accepted.

    Returns:
        The protein sequence. When a stop codon is reached it is included
        as a trailing ``'_'``. An empty string is returned when *seq*
        contains no ``ATG``.

    Raises:
        KeyError: If a codon contains a character other than A, C, G or T.
    """
    table = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
        'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W'
    }
    # The table only has upper-case keys, so normalise the input first.
    seq = seq.upper()
    start_sit = re.search('ATG', seq)
    if start_sit is None:
        # No start codon: nothing to translate.
        return ""
    residues = []
    # Stopping at len(seq) - 2 skips a trailing fragment of one or two
    # bases, which is not a codon and has no entry in the table.
    for sit in range(start_sit.start(), len(seq) - 2, 3):
        amino_acid = table[seq[sit:sit + 3]]
        residues.append(amino_acid)
        if amino_acid == '_':
            break
    return "".join(residues)

  完整代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import re


def dna_translate(seq):
    """Translate a DNA coding strand into a one-letter protein sequence.

    Translation starts at the first ``ATG`` in *seq* and reads one codon
    (three bases) at a time until a stop codon (``TAA``, ``TAG`` or
    ``TGA``) is translated or no complete codon is left.

    Args:
        seq: DNA coding-strand sequence; upper- or lower-case letters are
            accepted.

    Returns:
        The protein sequence. When a stop codon is reached it is included
        as a trailing ``'_'``. An empty string is returned when *seq*
        contains no ``ATG``.

    Raises:
        KeyError: If a codon contains a character other than A, C, G or T.
    """
    table = {
        'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
        'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
        'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
        'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
        'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
        'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
        'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
        'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
        'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W'
    }
    # The table only has upper-case keys, so normalise the input first.
    seq = seq.upper()
    start_sit = re.search('ATG', seq)
    if start_sit is None:
        # No start codon: nothing to translate.
        return ""
    residues = []
    # Stopping at len(seq) - 2 skips a trailing fragment of one or two
    # bases, which is not a codon and has no entry in the table.
    for sit in range(start_sit.start(), len(seq) - 2, 3):
        amino_acid = table[seq[sit:sit + 3]]
        residues.append(amino_acid)
        if amino_acid == '_':
            break
    return "".join(residues)


def read_seq(inputfile):
    """Read a sequence from a text file and return it as one unbroken string.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        The file contents with every line-feed and carriage-return
        character removed.
    """
    # The context manager closes the file even if reading raises.
    with open(inputfile, "r") as handle:
        seq = handle.read()
    return seq.replace("\n", "").replace("\r", "")


if __name__ == '__main__':
    dna_path = "E:\\python_pycharm\\一些Python程序练习\\DNA\\dna.txt"
    dna = read_seq(dna_path)
    print(dna)  # original DNA sequence (coding strand)
    # [:-1] drops the last character of the translation, i.e. the '_' that
    # marks the stop codon.
    # NOTE(review): this assumes translation ended at a stop codon; if it
    # did not, the last real residue is dropped instead.
    print(dna_translate(dna)[:-1])

  在上面程序的最后一行print(dna_translate(dna)[:-1])中,末尾加上[:-1]的目的是“删除”掉蛋白质序列末尾代表终止密码子的_(下划线)。输出结果如下:

1
2
GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCAGATCATCTTGCTTTGCCAGTTTGGGGTTGGGACTTTTGCCAATGTATTTCTCTTTGTCTATAATTTCTCTCCAATCTCGACTGGTTCTAAACAGAGGCCCAGACAAGTGATTTTAAGACACATGGCTGTGGCCAATGCCTTAACTCTCTTCCTCACTATATTTCCAAACAACATGATGACTTTTGCTCCAATTATTCCTCAAACTGACCTCAAATGTAAATTAGAATTCTTCACTCGCCTCGTGGCAAGAAGCACAAACTTGTGTTCAACTTGTGTTCTGAGTATCCATCAGTTTGTCACACTTGTTCCTGTTAATTCAGGTAAAGGAATACTCAGAGCAAGTGTCACAAACATGGCAAGTTATTCTTGTTACAGTTGTTGGTTCTTCAGTGTCTTAAATAACATCTACATTCCAATTAAGGTCACTGGTCCACAGTTAACAGACAATAACAATAACTCTAAAAGCAAGTTGTTCTGTTCCACTTCTGATTTCAGTGTAGGCATTGTCTTCTTGAGGTTTGCCCATGATGCCACATTCATGAGCATCATGGTCTGGACCAGTGTCTCCATGGTACTTCTCCTCCATAGACATTGTCAGAGAATGCAGTACATATTCACTCTCAATCAGGACCCCAGGGGCCAAGCAGAGACCACAGCAACCCATACTATCCTGATGCTGGTAGTCACATTTGTTGGCTTTTATCTTCTAAGTCTTATTTGTATCATCTTTTACACCTATTTTATATATTCTCATCATTCCCTGAGGCATTGCAATGACATTTTGGTTTCGGGTTTCCCTACAATTTCTCCTTTACTGTTGACCTTCAGAGACCCTAAGGGTCCTTGTTCTGTGTTCTTCAACTGTTGAAAGCCAGAGTCACTAAAAATGCCAAACACAGAAGACAGCTTTGCTAATACCATTAAATACTTTATTCCATAAATATGTTTTTAAAAGCTTGTATGAACAAGGTATGGTGCTCACTGCTATACTTATAAAAGAGTAAGGTTATAATCACTTGTTGATATGAAAAGATTTCTGGTTGGAATCTGATTGAAACAGTGAGTTATTCACCACCCTCCATTCTCT
MSTHDTSLKTTEEVAFQIILLCQFGVGTFANVFLFVYNFSPISTGSKQRPRQVILRHMAVANALTLFLTIFPNNMMTFAPIIPQTDLKCKLEFFTRLVARSTNLCSTCVLSIHQFVTLVPVNSGKGILRASVTNMASYSCYSCWFFSVLNNIYIPIKVTGPQLTDNNNNSKSKLFCSTSDFSVGIVFLRFAHDATFMSIMVWTSVSMVLLLHRHCQRMQYIFTLNQDPRGQAETTATHTILMLVVTFVGFYLLSLICIIFYTYFIYSHHSLRHCNDILVSGFPTISPLLLTFRDPKGPCSVFFNC

  类似地,我们可以把上面几个程序综合一下,实现“DNA—(转录)→mRNA—(翻译)→蛋白质”的完整过程。代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re


def mrna_translate(seq):
    """Translate an mRNA sequence into a one-letter protein sequence.

    Translation starts at the first ``AUG`` in *seq* and reads one codon
    (three bases) at a time until a stop codon (``UAA``, ``UAG`` or
    ``UGA``) is translated or no complete codon is left.

    Args:
        seq: mRNA sequence; upper- or lower-case letters are accepted.

    Returns:
        The protein sequence. When a stop codon is reached it is included
        as a trailing ``'_'``. An empty string is returned when *seq*
        contains no ``AUG``.

    Raises:
        KeyError: If a codon contains a character other than A, C, G or U.
    """
    table = {
        'AUA': 'I', 'AUC': 'I', 'AUU': 'I', 'AUG': 'M',
        'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACU': 'T',
        'AAC': 'N', 'AAU': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGC': 'S', 'AGU': 'S', 'AGA': 'R', 'AGG': 'R',
        'CUA': 'L', 'CUC': 'L', 'CUG': 'L', 'CUU': 'L',
        'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCU': 'P',
        'CAC': 'H', 'CAU': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGU': 'R',
        'GUA': 'V', 'GUC': 'V', 'GUG': 'V', 'GUU': 'V',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCU': 'A',
        'GAC': 'D', 'GAU': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGU': 'G',
        'UCA': 'S', 'UCC': 'S', 'UCG': 'S', 'UCU': 'S',
        'UUC': 'F', 'UUU': 'F', 'UUA': 'L', 'UUG': 'L',
        'UAC': 'Y', 'UAU': 'Y', 'UAA': '_', 'UAG': '_',
        'UGC': 'C', 'UGU': 'C', 'UGA': '_', 'UGG': 'W'
    }
    # The table only has upper-case keys, so normalise the input first.
    seq = seq.upper()
    start_sit = re.search('AUG', seq)
    if start_sit is None:
        # No start codon: nothing to translate.
        return ""
    residues = []
    # Stopping at len(seq) - 2 skips a trailing fragment of one or two
    # bases, which is not a codon and has no entry in the table.
    for sit in range(start_sit.start(), len(seq) - 2, 3):
        amino_acid = table[seq[sit:sit + 3]]
        residues.append(amino_acid)
        if amino_acid == '_':
            break
    return "".join(residues)


def transcription(seq):
    """Transcribe a DNA sequence into mRNA.

    The given DNA sequence is assumed to be the coding strand, so the mRNA
    is produced by upper-casing it and replacing every T with U.
    """
    return seq.upper().replace('T', 'U')


def read_seq(inputfile):
    """Read a sequence from a text file and return it as one unbroken string.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        The file contents with every line-feed and carriage-return
        character removed.
    """
    # The context manager closes the file even if reading raises.
    with open(inputfile, "r") as handle:
        seq = handle.read()
    return seq.replace("\n", "").replace("\r", "")


if __name__ == '__main__':
    dna = read_seq("E:\\python_pycharm\\一些Python程序练习\\DNA\\dna.txt")
    print(dna)  # original DNA sequence (coding strand)
    mrna = transcription(dna)
    print(mrna)  # transcribed mRNA sequence
    # mrna_translate() appends '_' only when it reaches a stop codon, so
    # strip that marker explicitly. Slicing with [:-1] would drop a real
    # residue whenever translation ends without a stop codon.
    print(mrna_translate(mrna).rstrip('_'))  # translated protein sequence

写在最后

  如果你有什么更好的想法,欢迎给我留言。
  我的邮箱:1398635912@qq.com