python中计算dna序列的GC含量
001、对G、C计数进行统计
[root@pc1 test01]# ls a.fa test.py [root@pc1 test01]# cat a.fa ## 测试DNA序列 >Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC TCCCACTAATAATTCTGAGG >Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT ATATCCATTTGTCAGCAGACACGC >Rosalind_0808 CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC TGGGAACCTGCGGGCAGTAGGTGGAAT [root@pc1 test01]# cat test.py ## 统计程序 #!/usr/bin/env python # -*- coding: utf-8 -*- in_file = open("a.fa", "r") dict1 = dict() for i in in_file: i = i.strip() if i.startswith(">"): temp = i dict1[temp] = str() else: dict1[temp] += i in_file.close() dict2 = dict() for i,j in dict1.items(): count = 0 for k in j: if k == "C" or k == "G": count += 1 dict2[i] = count/len(j) for i,j in dict2.items(): print("%s %.2f" % (i, j * 100)) max_value = max(dict2.values()) for i,j in dict2.items(): if j == max_value: print(i.replace(">", "")) print("%.6f" % (j * 100)) [root@pc1 test01]# python3 test.py ## 输出结果 >Rosalind_6404 53.75 >Rosalind_5959 53.57 >Rosalind_0808 60.92 Rosalind_0808 60.919540

002、利用函数结构实现
[root@pc1 test01]# ls a.fa test.py [root@pc1 test01]# cat a.fa ## 测试fasta文件 >Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC TCCCACTAATAATTCTGAGG >Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT ATATCCATTTGTCAGCAGACACGC >Rosalind_0808 CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC TGGGAACCTGCGGGCAGTAGGTGGAAT [root@pc1 test01]# cat test.py ## 统计程序 #!/usr/bin/env python # -*- coding: utf-8 -*- in_file = open("a.fa", "r") dict1 = dict() for i in in_file: i = i.strip() if i[0] == ">": temp = i dict1[temp] = "" else: dict1[temp] += i def gc(sequence): return (sequence.count("C") + sequence.count("G")) * 100/len(sequence) def max_gc(fa): dict2 = {} for i,j in fa.items(): dict2[i] = gc(j) return max(dict2.items(), key = lambda x:x[1]) print(max_gc(dict1)) [root@pc1 test01]# python3 test.py ## 统计结果 ('>Rosalind_0808', 60.91954022988506)

003、改进
[root@pc1 test01]# ls a.fa test.py [root@pc1 test01]# cat a.fa ## 测试DNA序列 >Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC TCCCACTAATAATTCTGAGG >Rosalind_5959 CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT ATATCCATTTGTCAGCAGACACGC >Rosalind_0808 CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC TGGGAACCTGCGGGCAGTAGGTGGAAT [root@pc1 test01]# cat test.py ## 计算程序 #!/usr/bin/env python # -*- coding: utf-8 -*- in_file = open("a.fa", "r") dict1 = dict() for i in in_file: i = i.strip() if i[0] == ">": temp = i dict1[temp] = "" else: dict1[temp] += i def gc(sequence): return (sequence.count("C") + sequence.count("G")) * 100/len(sequence) def max_gc(fa): dict2 = {} for i,j in fa.items(): dict2[i] = gc(j) tmp = max(dict2.items(), key = lambda x:x[1]) return (tmp[0].lstrip(">") + "\n" + str(tmp[1])) print(max_gc(dict1)) [root@pc1 test01]# python3 test.py ## 计算结果 Rosalind_0808 60.91954022988506

。