序列分析

Biopython 脚本库

批量处理 FASTA 文件

from Bio import SeqIO
from pathlib import Path

def batch_process_fasta(input_dir, output_file):
    """Merge every ``*.fasta`` file found directly in a directory.

    Input files are processed in sorted path order so the merged output is
    reproducible, and records are streamed to the output instead of being
    collected in memory first. Because of the streaming, a failure while
    parsing an input file can leave a partially written output file.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.fasta`` files.
        output_file: Destination path, or an open handle, passed to
            ``SeqIO.write``.

    Prints a summary line with the number of records written.
    """
    # When the destination is a path (not an open handle), resolve it so a
    # previous merge result that lives inside input_dir is not read back in,
    # which would duplicate records on every re-run.
    if isinstance(output_file, (str, Path)):
        output_path = Path(output_file).resolve()
    else:
        output_path = None

    # glob() yields entries in filesystem order; sort for a stable result.
    fasta_files = sorted(
        path
        for path in Path(input_dir).glob("*.fasta")
        if path.resolve() != output_path
    )

    def iter_records():
        # Lazily chain the records of all input files, one file at a time.
        for fasta_file in fasta_files:
            yield from SeqIO.parse(fasta_file, "fasta")

    # SeqIO.write accepts any iterable of records and returns how many
    # records it wrote.
    count = SeqIO.write(iter_records(), output_file, "fasta")
    print(f"处理完成,共 {count} 条序列")

# Usage example: merge the FASTA files under ./sequences/ into merged.fasta
batch_process_fasta(input_dir="./sequences/", output_file="merged.fasta")

反向互补序列转换

from Bio.Seq import Seq

def reverse_complement(sequence):
    """Return the reverse complement of ``sequence`` as a plain string.

    The input is wrapped in a Biopython ``Seq`` and the result of its
    ``reverse_complement()`` method is converted back to ``str``.
    """
    return str(Seq(sequence).reverse_complement())

# Usage example: print a sequence next to its reverse complement
dna = "ATGCGATCGATCG"
rc = reverse_complement(dna)
print(f"原序列: {dna}", f"反向互补: {rc}", sep="\n")

ORF 预测

from Bio.Seq import Seq

def find_orfs(sequence, min_length=100):
    """Find open reading frames (ORFs) in the three forward reading frames.

    Each frame is translated in full. An ORF starts at the first ``M`` seen
    after the previous stop and ends just before the next ``*``. An ORF that
    reaches the end of the sequence without a stop codon is not reported,
    and the reverse strand is not scanned.

    Args:
        sequence: Nucleotide sequence, passed to ``Seq``.
        min_length: Minimum ORF length in nucleotides, stop codon excluded.

    Returns:
        A list of dicts, one per ORF, with the keys:
        ``frame`` (1-3), ``start`` (0-based nucleotide offset of the start
        codon within ``sequence``), ``length`` (nucleotides, stop codon
        excluded) and ``sequence`` (the translated protein as ``str``,
        without the stop symbol).
    """
    seq = Seq(sequence)
    orfs = []

    # Scan the three forward reading frames.
    for frame in range(3):
        # Translate whole codons only: Biopython warns about a partial
        # trailing codon (and documents that this may become an error), and
        # those leftover bases cannot encode a residue anyway.
        coding_len = (len(seq) - frame) // 3 * 3
        if coding_len < 3:
            continue
        trans = seq[frame:frame + coding_len].translate(to_stop=False)
        start = None

        for i, aa in enumerate(trans):
            if aa == 'M' and start is None:
                # First start codon since the last stop opens the ORF.
                start = i
            elif aa == '*' and start is not None:
                orf_len = (i - start) * 3
                if orf_len >= min_length:
                    orfs.append({
                        'frame': frame + 1,
                        'start': frame + start * 3,
                        'length': orf_len,
                        'sequence': str(trans[start:i])
                    })
                # Reset even when the ORF was too short, so the next M
                # opens a new candidate.
                start = None

    return orfs

# Usage example: list the ORFs of at least 6 nucleotides in a short sequence
dna = "ATGAAACCCGGGTTTTAG"
orfs = find_orfs(sequence=dna, min_length=6)
for orf in orfs:
    print(orf)

序列统计

from Bio.SeqUtils import gc_fraction

def sequence_stats(fasta_file):
    """Collect basic per-record statistics from a FASTA file.

    Args:
        fasta_file: FASTA source handed to ``SeqIO.parse``.

    Returns:
        A list with one dict per record, holding ``id`` (the record ID),
        ``length`` (sequence length) and ``gc_content`` (``gc_fraction`` of
        the sequence multiplied by 100).
    """
    from Bio import SeqIO

    return [
        {
            'id': rec.id,
            'length': len(rec.seq),
            'gc_content': gc_fraction(rec.seq) * 100,
        }
        for rec in SeqIO.parse(fasta_file, "fasta")
    ]