简易kozak寻找

一、shell命令

#!/bin/bash
awk '/^>/ { if (seq != "") { printf("%s\n", seq) } seq = ""; print $0 } !/^>/ { seq = seq $0 } END { printf("%s\n", seq) }' Oryza_sativa.IRGSP-1.0.cdna.all.fa > modified_cdna_file.fa
gene_num=$(awk 'NR%2==0' modified _cdna_file.fa | grep -c '^')
# 初始化计数器
count_even_lines=0

# 使用 awk 找到偶数行的第一个 ATG,并统计符合条件的行数
awk 'NR%2==0 && match($0, /ATG/) { 
        atg_pos = RSTART
        # 提取 ATG 前三个字符和后一个字符组成的序列
        atg_seq = substr($0, atg_pos - 3, 7)
        if (atg_pos > 3 && atg_seq ~ /[AG]..ATGG/) {
            count_even_lines++
        }
    } END {
        print "符合 Kozak 序列的基因数:" count_even_lines
    }' modified_cdna_file.fa
echo "水稻cDNA序列含有 $gene_num"

结果:11862个

二、python命令

import re
from Bio import SeqIO

candidate_kozak_num = 0
gene_num = 0
records = SeqIO.parse("Oryza_sativa.IRGSP-1.0.cdna.all.fa", "fasta")
for record in records:
    sequence = str(record.seq)
    start_index = re.search(r'ATG', sequence)
    gene_num += 1
    if start_index:
        start_index = start_index.start()
        if re.match(r'[AG]..ATGG', sequence[start_index - 3:start_index + 4]):
            candidate_kozak_num += 1
print(gene_num, candidate_kozak_num)

结果:42789个基因,11862个kozak序列