简易kozak寻找
Updated on: 2024-03-17 17:44:49
一、shell命令
#!/bin/bash
awk '/^>/ { if (seq != "") { printf("%s\n", seq) } seq = ""; print $0 } !/^>/ { seq = seq $0 } END { printf("%s\n", seq) }' Oryza_sativa.IRGSP-1.0.cdna.all.fa > modified_cdna_file.fa
gene_num=$(awk 'NR%2==0' modified _cdna_file.fa | grep -c '^')
# 初始化计数器
count_even_lines=0
# 使用 awk 找到偶数行的第一个 ATG,并统计符合条件的行数
awk 'NR%2==0 && match($0, /ATG/) {
atg_pos = RSTART
# 提取 ATG 前三个字符和后一个字符组成的序列
atg_seq = substr($0, atg_pos - 3, 7)
if (atg_pos > 3 && atg_seq ~ /[AG]..ATGG/) {
count_even_lines++
}
} END {
print "符合 Kozak 序列的基因数:" count_even_lines
}' modified_cdna_file.fa
echo "水稻cDNA序列含有 $gene_num"
结果:11862个
二、python命令
import re
from Bio import SeqIO
candidate_kozak_num = 0
gene_num = 0
records = SeqIO.parse("Oryza_sativa.IRGSP-1.0.cdna.all.fa", "fasta")
for record in records:
sequence = str(record.seq)
start_index = re.search(r'ATG', sequence)
gene_num += 1
if start_index:
start_index = start_index.start()
if re.match(r'[AG]..ATGG', sequence[start_index - 3:start_index + 4]):
candidate_kozak_num += 1
print(gene_num, candidate_kozak_num)
结果:42789个基因,11862个kozak序列