python中计算dna序列的GC含量

小鲨鱼2018 / 2023-08-28 / 原文

001、对G、C计数进行统计

[root@pc1 test01]# ls
a.fa  test.py
[root@pc1 test01]# cat a.fa          ## 测试DNA序列
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
[root@pc1 test01]# cat test.py      ## 统计程序
#!/usr/bin/env python
# -*- coding: utf-8 -*-

in_file = open("a.fa", "r")

dict1 = dict()
for i in in_file:
        i = i.strip()
        if i.startswith(">"):
                temp = i
                dict1[temp] = str()
        else:
                dict1[temp] += i
in_file.close()

dict2 = dict()
for i,j in dict1.items():
        count = 0
        for k in j:
                if k == "C" or k == "G":
                        count += 1
        dict2[i] = count/len(j)

for i,j in dict2.items():
        print("%s %.2f" % (i, j * 100))

max_value = max(dict2.values())
for i,j in dict2.items():
        if j == max_value:
                print(i.replace(">", ""))
                print("%.6f" % (j * 100))
[root@pc1 test01]# python3 test.py       ## 输出结果
>Rosalind_6404 53.75
>Rosalind_5959 53.57
>Rosalind_0808 60.92
Rosalind_0808
60.919540

002、利用函数结构实现

[root@pc1 test01]# ls
a.fa  test.py
[root@pc1 test01]# cat a.fa             ## 测试fasta文件
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
[root@pc1 test01]# cat test.py          ## 统计程序
#!/usr/bin/env python
# -*- coding: utf-8 -*-

in_file = open("a.fa", "r")

dict1 = dict()
for i in in_file:
        i = i.strip()
        if i[0] == ">":
                temp = i
                dict1[temp] = ""
        else:
                dict1[temp] += i

def gc(sequence):
        return (sequence.count("C") + sequence.count("G")) * 100/len(sequence)

def max_gc(fa):
        dict2 = {}
        for i,j in fa.items():
                dict2[i] = gc(j)
        return max(dict2.items(), key = lambda x:x[1])

print(max_gc(dict1))
[root@pc1 test01]# python3 test.py             ## 统计结果
('>Rosalind_0808', 60.91954022988506)

003、改进

[root@pc1 test01]# ls
a.fa  test.py
[root@pc1 test01]# cat a.fa        ## 测试DNA序列
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
[root@pc1 test01]# cat test.py     ## 计算程序
#!/usr/bin/env python
# -*- coding: utf-8 -*-

in_file = open("a.fa", "r")

dict1 = dict()
for i in in_file:
        i = i.strip()
        if i[0] == ">":
                temp = i
                dict1[temp] = ""
        else:
                dict1[temp] += i

def gc(sequence):
        return (sequence.count("C") + sequence.count("G")) * 100/len(sequence)

def max_gc(fa):
        dict2 = {}
        for i,j in fa.items():
                dict2[i] = gc(j)
        tmp = max(dict2.items(), key = lambda x:x[1])
        return (tmp[0].lstrip(">") + "\n" + str(tmp[1]))

print(max_gc(dict1))
[root@pc1 test01]# python3 test.py    ## 计算结果
Rosalind_0808
60.91954022988506