序列分析
Biopython 脚本库
批量处理 FASTA 文件
from Bio import SeqIO
from pathlib import Path
def batch_process_fasta(input_dir, output_file):
    """Merge every ``*.fasta`` file in a directory into one FASTA file.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.fasta`` files.
        output_file: Destination handed to ``SeqIO.write`` (a path, or
            anything else ``SeqIO.write`` accepts).

    Prints the number of merged records when done; returns nothing.
    """
    # Path.glob yields entries in arbitrary filesystem order; sorting makes
    # the record order of the merged file reproducible across runs.
    fasta_paths = sorted(Path(input_dir).glob("*.fasta"))

    # If the output lives inside input_dir, the result of a previous run
    # would match the glob and be merged into itself, duplicating records.
    # Only path-like destinations can be compared; handles are left alone.
    if isinstance(output_file, (str, Path)):
        target = Path(output_file).resolve()
        fasta_paths = [p for p in fasta_paths if p.resolve() != target]

    # All inputs are read before writing, so a parse error leaves any
    # existing output file untouched.
    all_records = []
    for fasta_file in fasta_paths:
        all_records.extend(SeqIO.parse(fasta_file, "fasta"))

    SeqIO.write(all_records, output_file, "fasta")
    print(f"处理完成,共 {len(all_records)} 条序列")
# Usage example
batch_process_fasta("./sequences/", "merged.fasta")
反向互补序列转换
from Bio.Seq import Seq
def reverse_complement(sequence):
    """Return the reverse complement of ``sequence`` as a plain string.

    The input is wrapped in a Biopython ``Seq`` object, whose
    ``reverse_complement`` method produces the result.
    """
    flipped = Seq(sequence).reverse_complement()
    return str(flipped)
# Usage example
dna = "ATGCGATCGATCG"
rc = reverse_complement(dna)
print(f"原序列: {dna}")
print(f"反向互补: {rc}")
ORF 预测
from Bio.Seq import Seq
def find_orfs(sequence, min_length=100):
    """Find open reading frames in the three forward reading frames.

    Each frame is translated in full. An ORF starts at the first ``M``
    seen while no ORF is open and ends at the next ``*``; stops met while
    no ORF is open are ignored, and an ORF still open at the end of the
    frame (no stop codon) is not reported.

    Args:
        sequence: Nucleotide sequence, passed to ``Seq``.
        min_length: Minimum ORF length in nucleotides, counted from the
            start codon up to but excluding the stop codon.

    Returns:
        A list of dicts with the keys ``'frame'`` (1-3), ``'start'``
        (0-based nucleotide offset of the start codon in ``sequence``),
        ``'length'`` (nucleotides, stop codon excluded) and ``'sequence'``
        (the translated protein string, stop excluded).
    """
    seq = Seq(sequence)
    orfs = []

    # Only the forward strand is scanned (frames 1-3).
    for frame in range(3):
        framed = seq[frame:]
        # Drop the incomplete trailing codon: translating a length that is
        # not a multiple of three makes Biopython emit a "Partial codon"
        # warning, and that happens for two of the three frames otherwise.
        framed = framed[:len(framed) - len(framed) % 3]
        trans = framed.translate(to_stop=False)

        start = None  # amino-acid index of the currently open ORF, if any
        for i, aa in enumerate(trans):
            if aa == 'M' and start is None:
                start = i
            elif aa == '*' and start is not None:
                orf_len = (i - start) * 3
                if orf_len >= min_length:
                    orfs.append({
                        'frame': frame + 1,
                        'start': frame + start * 3,
                        'length': orf_len,
                        'sequence': str(trans[start:i]),
                    })
                # Close the ORF even when it was too short to report, so
                # the next 'M' can open a new one.
                start = None

    return orfs
# Usage example: scan a short sequence and print every ORF found
dna = "ATGAAACCCGGGTTTTAG"
orfs = find_orfs(dna, min_length=6)
for orf in orfs:
    print(orf)
序列统计
from Bio.SeqUtils import gc_fraction
def sequence_stats(fasta_file):
    """Collect per-record statistics from a FASTA file.

    Returns a list with one dict per record, holding the record ``id``,
    its sequence ``length`` and its ``gc_content`` as a percentage
    (``gc_fraction`` scaled by 100).
    """
    from Bio import SeqIO

    return [
        {
            'id': rec.id,
            'length': len(rec.seq),
            'gc_content': gc_fraction(rec.seq) * 100,
        }
        for rec in SeqIO.parse(fasta_file, "fasta")
    ]