提交饱和编辑的相关设计,及检验代码
This commit is contained in:
450
design/main.py
Normal file
450
design/main.py
Normal file
@@ -0,0 +1,450 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from glob import glob
|
||||
import requests as rq
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
import numpy as np
|
||||
from src.mutation import design_mutations_for_orf
|
||||
from src.reader import (extract_orf_sequence, get_cds_for_gene,
|
||||
load_uniprot_region, read_gtf, Region)
|
||||
from src.liftover import convert_interval
|
||||
from src.snp import decode_snp, generate_sequences_with_combinations
|
||||
import itertools
|
||||
from src.editseq import run_analysis
|
||||
|
||||
|
||||
# Remove loguru's default handler so we fully control the sinks.
logger.remove()

# Add a single stderr sink that only emits INFO-and-above records.
# logger.add(level="INFO")
logger.add(
    sys.stderr,
    colorize=True,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
    # <cyan>{name}</cyan>: <cyan>{function}</cyan>: <cyan>{line}</cyan>
    level="INFO"
)
|
||||
|
||||
|
||||
def split_regions(cds):
    """
    Split the original CDS intervals into 3-bp amino-acid (codon) regions.

    Codons that straddle an exon boundary are emitted as two partial regions:
    the tail of the previous exon (kind="end") and the head of the next exon
    (kind="start").  A negative `start` value between exons encodes how many
    bases of the split codon still have to be taken from the next exon.

    Test case exons:
    14:103698801-103699017
    14:103699133-103699179
    14:103699364-103699576
    14:103703173-103703327
    14:103707003-103707215
    14:103708522-103708659
    14:103711033-103711087
    """
    regions = []
    cds = sorted(cds, key=lambda x: (x.chrom, x.start, x.end))
    aa_codon_len = 3
    start = 0

    for x in cds:
        # start == 0 means no carry-over: begin at this exon's start.
        if start == 0:
            start = x.start
        elif start < 0:
            # A negative start means the previous exon did not divide evenly
            # into codons, so the leading few bases of this exon complete the
            # split codon and are written out as their own small region.
            regions.append(Region(x.chrom, x.start, x.start - start, kind="start"))
            regions[-1].addition = x
            start = x.start - start

        while start + aa_codon_len <= x.end:
            # Record whether this codon touches an exon boundary, and which one.
            code = "regular"
            if start == x.start:
                code = "start"
            elif start + aa_codon_len == x.end:
                code = "end"

            regions.append(Region(x.chrom, start, start + aa_codon_len, kind=code))
            regions[-1].addition = x
            start += aa_codon_len

        if start < x.end:
            # Codon crosses the exon's end boundary; emit the partial region
            # and encode the remaining bases as a negative carry for the next
            # exon.  NOTE(review): when exactly 1 bp remains, the carry
            # computes to 0, which the next iteration treats as "no carry" —
            # confirm whether coordinates are half-open and whether this is an
            # off-by-one.
            regions.append(Region(x.chrom, start, x.end, kind="end"))
            regions[-1].addition = x
            start = start - x.end + 1
        else:
            # Exon divided evenly; reset the carry pointer.
            start = 0

    return regions
|
||||
|
||||
|
||||
def download_uniprot_region(protein, output):
    """Download UniProt genomic-coordinate mappings for *protein* and write a TSV.

    The raw JSON response is cached next to *output* (same name, .json suffix)
    and reused on subsequent calls.  Each TSV row maps a genomic exon interval
    to its protein-coordinate interval.  Raises ValueError when the accession
    is not a human protein.
    """
    # `resp` first holds the cache file path, then the decoded JSON payload.
    resp = output.replace(".tsv", ".json")
    url = f"https://www.ebi.ac.uk/proteins/api/coordinates?accession={protein}"

    if os.path.exists(resp):
        with open(resp, "r") as r:
            resp = json.load(r)
    else:
        resp = rq.get(url, headers={"Accept": "application/json"})
        resp = resp.json()

    # Persist (or re-persist) the JSON cache.
    with open(output.replace(".tsv", ".json"), "w+") as w:
        json.dump(resp, w, indent=4)

    if not resp[0]["name"].endswith("HUMAN"):
        raise ValueError(f"protein is not human")

    # Canonical chromosome names, with and without the "chr" prefix.
    __chroms__ = [str(x) for x in range(1, 23)] + ["chr" + str(x) for x in range(1, 23)] + ["X", "Y", "chrX", "chrY"]

    with open(output, "w+") as w:
        # First line records the source URL for provenance.
        w.write(f"#{url}\n")
        for coord in resp[0]["gnCoordinate"]:
            chromosome = coord["genomicLocation"]["chromosome"]

            # Skip mappings on scaffolds/patches.
            if chromosome not in __chroms__:
                continue

            for row in coord["genomicLocation"]["exon"]:
                genome = row["genomeLocation"]
                genome = str(genome["begin"]["position"]) + "-" + str(genome["end"]["position"])

                protein = row["proteinLocation"]

                # Single-residue locations carry "position" instead of begin/end.
                if "end" not in protein and "position" in protein:
                    protein = [str(protein["position"]["position"]), "-", str(protein["position"]["position"])]
                else:
                    protein = [str(protein["begin"]["position"]), "-", str(protein["end"]["position"])]

                row = f"{chromosome}:{genome}\t{'\t'.join(protein)}"
                w.write(row + "\n")

            # Only the first mapping on a canonical chromosome is kept.
            break
|
||||
|
||||
|
||||
def get_aa_coords(genes, output):
    """For each gene in the Excel sheet *genes*, find its reviewed human
    UniProt accession and download its genomic/protein coordinate TSV into
    *output* via download_uniprot_region().
    """
    os.makedirs(output, exist_ok=True)
    df = pd.read_excel(genes)
    # df = df.loc[df["Batch"] == 1, :]

    for _, row in df.iterrows():
        # Gene symbol is taken positionally from the second column —
        # presumably the sheet layout is fixed; confirm against the workbook.
        gene_name = row[1]

        url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json"
        resp = rq.get(url)

        # NOTE: the inner `row` shadows the outer loop variable (harmless —
        # the outer value is not used again afterwards).
        for row in resp.json().get("results", []):
            if "HUMAN" in row["uniProtkbId"]:
                priority = row["primaryAccession"]
                download_uniprot_region(priority, os.path.join(output, f"{gene_name}_{priority}.tsv"))
                break
|
||||
|
||||
|
||||
|
||||
def adjust_cross_border_region(row):
    """Return 'chrom:start-end' for *row*, widening sub-codon cross-border
    regions to a full 3 bp.

    A region shorter than 3 bp whose kind carries a "cross" marker is extended
    into the neighbouring intron: a cross_start region is grown leftwards
    (fixed end), anything else rightwards (fixed start).  All other regions
    are rendered unchanged via str().
    """
    if len(row) >= 3 or "cross" not in row.kind:
        return str(row)

    if row.kind == "cross_start":
        lo, hi = row.end - 3, row.end
    else:
        lo, hi = row.start, row.start + 3
    return f"{row.chrom}:{lo}-{hi}"
|
||||
|
||||
|
||||
def design_by_aa(genes, fasta, output, stop_codon = False):
    """Design per-amino-acid saturation mutation sites and rules.

    For every gene TSV in *genes*, split its CDS into codon regions, extract
    their sequences from *fasta*, enumerate 3N variants per codon, filter out
    variants that would break intron-adjacent bases on cross-border codons,
    and write the resulting table to *output* (CSV).  When *stop_codon* is
    True, only variants introducing a stop codon are kept.
    """
    df = []
    for gene in glob(os.path.join(genes, "*.tsv")):
        logger.info(f"开始设计突变 {gene}...")
        key = os.path.basename(gene).split(".")[0]

        # Load the previously downloaded genomic regions.
        cds = load_uniprot_region(gene)

        # Partition into per-amino-acid (3 bp) regions.
        cds = split_regions(cds)

        if not cds:
            continue

        # Attach sequences.
        cds = extract_orf_sequence(fasta, cds, half_open=True)

        for idx, x in enumerate(cds):
            for strategy in ["3N"]:
                results = design_mutations_for_orf(x.sequence, strategy=strategy)
                for res in results:
                    for var in res["variants"]:
                        if var == res["original_codon"]:
                            continue

                        # 1 bp at the exon start: the other 2 bp lie in the previous
                        # exon's end, so the variant's first 2 bases must match the
                        # recorded intron-adjacent sequence.
                        if "cross_start" == x.kind and len(x) == 1 and var[:2] != x.sequence[:2]:
                            continue

                        # 2 bp at the exon start: 1 bp lies in the previous exon,
                        # so the variant's first base must match.
                        elif "cross_start" == x.kind and len(x) == 2 and var[0] != x.sequence[0]:
                            continue

                        # 1 bp at the exon end: the trailing 2 bp lie downstream,
                        # so the variant's last 2 bases must match.
                        elif "cross_end" == x.kind and len(x) == 1 and var[1:] != x.sequence[1:]:
                            continue

                        # 2 bp at the exon end: the trailing base must be C or G.
                        elif "cross_end" == x.kind and len(x) == 2 and var[-1] not in ["C", "G"]:
                            continue

                        row = [key, str(x.addition), idx+1, str(x), adjust_cross_border_region(x), x.kind, strategy, res["original_codon"], var]
                        df.append(row)

    df = pd.DataFrame(df)
    # NOTE: "origial_code" is a typo for "original_code", but downstream
    # consumers (e.g. editseq.process_aa_mutations) key on it — keep as-is.
    df.columns = ["gene", "cds_region", "aa_index", "aa_region", "region_with_intron", "cross_cds_border", "strategy",
                  "origial_code", "mutation_code"]
    # Re-label each row's strategy by the actual number of changed bases.
    strategy = []
    for _, row in df.iterrows():
        match = np.sum([x == y for x, y in zip(row["origial_code"], row["mutation_code"])])
        strategy.append(f"{3-match}N")

    df["strategy"] = strategy

    if stop_codon:
        df = df[df["mutation_code"].isin(["TAA", "TAG", "TGA"])]

    df.to_csv(output, index = False)
|
||||
|
||||
|
||||
def design_by_snp(snp_info, targets, genes, fasta, fasta_hg38, output):
    """Design mutation rows from curated SNP annotations.

    snp_info:   Excel workbook mapping cDNA changes to genomic (hg19) changes,
                one sheet per gene.
    targets:    Excel workbook (3rd sheet) whose columns are genes and whose
                cells are mutation codes.
    genes:      directory of per-gene UniProt region TSVs (for chromosome and
                CDS start lookup).
    fasta / fasta_hg38: reference genomes for hg19 and hg38 respectively.
    output:     CSV path written incrementally.
    """
    logger.info("读取染色体")
    # Per-gene chromosome and CDS start, taken from the first sorted region.
    chroms = {}
    starts = {}
    for gene in glob(os.path.join(genes, "*.tsv")):
        key = os.path.basename(gene).split(".")[0]
        cds = load_uniprot_region(gene)
        cds = sorted(cds, key=lambda x:[x.chrom, x.start, x.end])
        chroms[key] = cds[0].chrom
        starts[key] = cds[0].start

    logger.info(f"读取snp的信息:{snp_info}")
    all_sheets = pd.read_excel(snp_info, sheet_name=None)

    # Walk every worksheet and build cDNA -> genomic lookup tables.
    # NOTE: the column headers contain trailing spaces — presumably they match
    # the workbook exactly; do not "fix" them.
    res = {}
    for sheet_name, df in all_sheets.items():
        temp = {}
        for _, row in df.iterrows():
            cdna = row["DNA change (cDNA) "]
            hg38 = row["DNA change (genomic) (hg19) "]
            temp[cdna] = hg38
        # A sheet name may contain several gene aliases (split on parentheses
        # and whitespace); register the table under each token.
        for sheet in re.split(r"[\((\s\))]", sheet_name):
            res[sheet] = temp

    print(res.keys())

    logger.info(f"读取目标:{targets}")
    df = pd.read_excel(targets, sheet_name=2)

    with open(output, "w+") as w:
        w.write(",".join(["gene", "cdna code", "genomic code", "mutation_region", "version", "original_codon", "mutation_code"]) + "\n")
        for column in df.columns:
            # Skip pandas' auto-generated placeholder columns.
            if "Unnamed" in column:
                continue

            for code in df[column]:
                # Skip empty cells (NaN floats).
                if not isinstance(code, str) and math.isnan(code):
                    continue

                genomic_code = res.get(column, {}).get(code)

                # Prefer the curated genomic (hg19) change; otherwise decode a
                # raw cDNA "c." code relative to the gene's CDS start (hg38).
                # "FAND2" is a spreadsheet misspelling mapped back to FANCD2.
                if genomic_code:
                    sites, rule = decode_snp(genomic_code)
                elif str(code).startswith("c."):
                    sites, rule = decode_snp(code, ref_start=starts["FANCD2" if column == "FAND2" else column])
                else:
                    continue

                region = Region(chroms["FANCD2" if column == "FAND2" else column], start=sites[0], end=sites[-1])

                # Track which reference genome the coordinates refer to.
                hg38 = False
                if genomic_code:
                    region = extract_orf_sequence(fasta, [region])[0]
                elif str(code).startswith("c."):
                    hg38 = True
                    region = extract_orf_sequence(fasta_hg38, [region])[0]

                # Decode the mutation rule into original/replacement sequences
                # (mirrors decode_mutation(); branch order matters: exact
                # "ins" before the "delins"/"ins" substring checks).
                original, replacement = "", ""
                if ">" in rule:
                    original, replacement = rule.split(">")
                    original = region.sequence
                elif rule == "dup":
                    original = region.sequence
                    replacement = original * 2
                elif rule == "del":
                    original = region.sequence
                    replacement = ""
                elif rule == "ins":
                    replacement = region.sequence
                elif "delins" in rule:
                    original = region.sequence
                    replacement = rule.replace("delins", "")
                elif "ins" in rule:
                    original = region.sequence
                    replacement = rule.replace("ins", "")

                if not genomic_code:
                    genomic_code = ""

                # Expand every N in the sequences into all concrete base
                # combinations and emit one CSV row per combination.
                for o, r in itertools.product(generate_sequences_with_combinations(original), generate_sequences_with_combinations(replacement)):
                    w.write(",".join([column, code.strip(), str(genomic_code).strip(), str(region), "hg38" if hg38 else "hg19", o, r]) + "\n")

    # data = pd.DataFrame(data)
    # data.columns = ["gene", "cdna code", "genomic code", "mutation_region", "original_codon", "mutation_code"]
    # data.to_csv(output, index = False)
|
||||
|
||||
|
||||
|
||||
def extract_fastq_seq(fastq: str, chrom, start, end):
    """Fetch chrom:start-end from an indexed FASTA via pysam.

    NOTE(review): despite the name and parameter, this expects a FASTA file
    (pysam.FastaFile).  pysam's fetch(contig, start, end) uses 0-based,
    half-open coordinates.
    """
    import pysam
    with pysam.FastaFile(fastq) as fh:
        rec = fh.fetch(str(chrom), start, end)
        # print(rec)
        return rec
|
||||
|
||||
|
||||
def decode_mutation(rule: str, sequence):
    """Decode an HGVS-style mutation *rule* against a reference *sequence*.

    Returns (original, replacement):
      "X>Y"       -> (sequence, Y)           substitution
      "dup"       -> (sequence, sequence*2)  duplication
      "del"       -> (sequence, "")          deletion
      "ins"       -> ("", sequence)          insertion of the given sequence
      "delins..." -> (sequence, suffix)      deletion-insertion
      "ins..."    -> (sequence, suffix)      insertion with explicit bases
    Unrecognised rules yield ("", "").
    """
    if ">" in rule:
        _, replacement = rule.split(">")
        return sequence, replacement
    if rule == "dup":
        return sequence, sequence * 2
    if rule == "del":
        return sequence, ""
    if rule == "ins":
        return "", sequence
    # Substring checks last; "delins" must be tested before the bare "ins".
    if "delins" in rule:
        return sequence, rule.replace("delins", "")
    if "ins" in rule:
        return sequence, rule.replace("ins", "")
    return "", ""
|
||||
|
||||
|
||||
def design_by_hmgd(data, fasta, outfile):
    """Build EditSeq rows from an HGMD-style mutation CSV.

    *data* must provide columns: gene, hgvs, chromosome, startCoord, endCoord.
    For each row, the reference sequence plus 100 bp flanks are extracted from
    *fasta* and formatted as "flank(orig/repl)flank"; results are written to
    *outfile* as CSV.
    """
    import re
    res = pd.read_csv(data)
    # print(res.head())

    # Expected input columns:
    # hgvs
    # chromosome
    # startCoord
    # endCoord

    data = []
    for idx, row in res.iterrows():

        # Row index makes the name unique per gene.
        key = row["gene"] + "_" + str(idx)

        try:
            # Coordinates appear to be 1-based inclusive: convert start to
            # pysam's 0-based half-open convention.
            seq = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1,row["endCoord"])

            seq, replace = decode_mutation(row["hgvs"], seq)

            if not seq:
                continue
            # Strip position digits/underscores embedded in the HGVS suffix.
            replace = re.sub(r"[\d_]", "", replace)

            if "del" in replace:
                replace = ""

            print(key, seq, replace)

            # 100 bp flanks on either side of the mutation.
            before = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1 - 100, row["startCoord"])
            after = extract_fastq_seq(fasta, int(row["chromosome"]), row["endCoord"], row["endCoord"] + 100)


            seq = f"{before}({seq}/{replace}){after}"
            data.append({"sequence_name": key, "editseq": seq})
        except Exception:
            # Deliberate best-effort: skip rows with unparsable coordinates /
            # contigs missing from the FASTA rather than aborting the run.
            continue

    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # NOTE(review): Fire is imported but no Fire(...) entry point is active —
    # the pipeline steps below are toggled by (un)commenting them.
    from fire import Fire

    # get_aa_coords(
    #     "../metainfo/Cancer and blood disorder panels_v2.xlsx",
    #     "../gene_coords/batch2"
    # )

    # get_aa_coords(
    #     "../metainfo/DDR gene library in 2021 Cell.xlsx",
    #     "../gene_coords/positive"
    # )


    # # Fire({"aa": design_by_aa})
    # design_by_aa(
    #     "../gene_coords/batch2",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_batch2.csv.gz"
    # )

    # design_by_aa(
    #     "../gene_coords/positive",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_positive.csv.gz",
    #     stop_codon = True
    # )

    # run_analysis(
    #     "../gene_aa_target_batch2.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/batch2"
    # )

    # run_analysis(
    #     "../gene_aa_target_positive.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/positive"
    # )

    # Generate the SNP table; snp_info is the curated SNP workbook,
    # targets lists the genes to process.
    # design_by_snp(
    #     snp_info="../metainfo/副本FA家族基因-20250829-DJJ_XD.xlsx",
    #     targets="../metainfo/实验计划.xlsx",
    #     output="gene_snp_target.csv",
    #     fasta="../ref/gencode/GRCh37.p13.genome.fa.gz",
    #     fasta_hg38="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     genes="../gene_coords"
    # )

    # design_by_hmgd(
    #     "../metainfo/allmut.csv",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outfile="../../prediction/input/pos_v2.csv.gz"
    # )


    # url = "https://www.ebi.ac.uk/proteins/api/coordinates?accession=P21359-1"
    # download_uniprot_region("Test", "P21359")
|
||||
|
||||
|
||||
16
design/pyproject.toml
Normal file
16
design/pyproject.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[project]
|
||||
name = "pgrna"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
    "biopython>=1.85",
    "fire>=0.7.1",
    "loguru>=0.7.3",
    "openpyxl>=3.1.5",
    "pandas>=2.3.3",
    "pyfaidx>=0.9.0.3",
    "pyliftover>=0.4.1",
    # requests is imported at the top of design/main.py but was missing here
    "requests>=2.32",
    "rich>=14.2.0",
]
|
||||
233
design/src/editseq.py
Normal file
233
design/src/editseq.py
Normal file
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
created by lanzl
|
||||
modified by zym
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import re
|
||||
import sys
|
||||
from pyfaidx import Fasta, FetchError
|
||||
|
||||
HG19_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/gencode/GRCh37.p13.genome.fa.gz"
|
||||
HG38_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
|
||||
|
||||
|
||||
def parse_region(region_str: str) -> tuple:
    """Parse a 'chrom:start-end' genomic region string.

    Ensures the chromosome name carries a 'chr' prefix and returns
    (chrom, start, end) with integer coordinates.

    Raises:
        ValueError: when *region_str* does not match 'chrom:start-end'.
            (Previously a malformed string crashed with an opaque
            AttributeError on the unchecked re.match result.)
    """
    match = re.match(r"(\w+):(\d+)-(\d+)", region_str)
    if match is None:
        raise ValueError(f"invalid genomic region: {region_str!r}")
    chrom, start, end = match.groups()

    if not chrom.lower().startswith("chr"):
        chrom = "chr" + chrom

    return chrom, int(start), int(end)
|
||||
|
||||
|
||||
def extract_orf_sequence(genome: Fasta, chrom: str, start: int, end: int) -> str:
    """
    Extract an uppercased sequence from a preloaded pyfaidx Fasta
    (1-based, inclusive coordinates).

    Falls back to the alternate chromosome naming convention: if *chrom*
    starts with 'chr' the prefix is stripped, otherwise it is added.
    (Previously only the strip direction was tried, so a bare name against a
    'chr'-prefixed FASTA always failed.)

    Raises:
        FetchError: when neither naming exists in the FASTA index.
    """
    try:
        return str(genome.get_seq(chrom, start, end)).upper()
    except (KeyError, FetchError):
        # Try the opposite "chr" convention before giving up.
        if chrom.lower().startswith("chr"):
            alt_chrom = chrom[3:]
        else:
            alt_chrom = "chr" + chrom

        try:
            return str(genome.get_seq(alt_chrom, start, end)).upper()
        except (KeyError, FetchError) as inner_e:
            raise FetchError(
                f"Requested rname '{chrom}' (also tried '{alt_chrom}') does not exist in FASTA index."
            ) from inner_e
|
||||
|
||||
|
||||
def generate_editseq(
    original: str,
    replacement: str,
    region_str: str,
    genome: Fasta,
    flank_size: int = 100,
) -> str:
    """Build an EditSeq string: upstream flank, '(edit)', downstream flank.

    The edit part is '(ORIG/REPL)' for substitutions/delins, '(-ORIG)' for
    deletions and '(+REPL)' for insertions.  Flanks of *flank_size* bp are
    taken immediately outside the mutated interval.
    """
    chrom, mut_start, mut_end = parse_region(region_str)

    # Flanking sequence immediately before and after the mutated interval.
    upstream_flank = extract_orf_sequence(
        genome, chrom, mut_start - flank_size, mut_start - 1
    )
    downstream_flank = extract_orf_sequence(
        genome, chrom, mut_end + 1, mut_end + flank_size
    )

    orig = str(original).strip()
    repl = str(replacement).strip()

    # All replacements (equal or unequal length) share the (ORIG/REPL) form.
    if orig and repl:
        mut_part = f"({orig}/{repl})"
    elif orig:
        mut_part = f"(-{orig})"          # deletion
    elif repl:
        mut_part = f"(+{repl})"          # insertion
    else:
        mut_part = "(Invalid mutation logic)"

    return f"{upstream_flank}{mut_part}{downstream_flank}"
|
||||
|
||||
|
||||
# --- Amino-acid (AA) saturation mutagenesis processing ---
def process_aa_mutations(df_aa: pd.DataFrame, genome_hg38: Fasta) -> pd.DataFrame:
    """Process AA saturation-mutagenesis rows into EditSeq records.

    Returns a DataFrame with sequence_name, editseq, strategy and
    mutation_type columns.
    """
    results = []

    # to_dict('records') is markedly faster than iterrows().
    for row in df_aa.to_dict("records"):
        original = (
            str(row["origial_code"]).strip() if pd.notna(row["origial_code"]) else ""
        )
        replacement = (
            str(row["mutation_code"]).strip() if pd.notna(row["mutation_code"]) else ""
        )

        # Classify the mutation for reporting.
        if original and replacement:
            mut_type = "REPL"
        elif original and not replacement:
            mut_type = "DEL"
        elif not original and replacement:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"

        # Sequence name keeps the raw cell values, as before.
        seq_name = f"{row['gene']}_AA{row['aa_index']}_{row['origial_code']}_{row['mutation_code']}"

        # Bug fix / consistency: pass the normalized `original`/`replacement`
        # (exactly as process_snp_mutations does) instead of the raw cells, so
        # a NaN cell becomes "" rather than the literal string "nan" inside
        # the generated editseq.
        editseq = generate_editseq(
            original=original,
            replacement=replacement,
            region_str=row["aa_region"],
            genome=genome_hg38,
        )

        results.append(
            {
                "sequence_name": seq_name,
                "editseq": editseq,
                "strategy": row["strategy"],
                "mutation_type": mut_type,
            }
        )

    return pd.DataFrame(results)
|
||||
|
||||
|
||||
# --- SNP/cDNA mutation processing ---
def process_snp_mutations(
    df_snp: pd.DataFrame, genome_hg19: Fasta, genome_hg38: Fasta
) -> pd.DataFrame:
    """Process SNP/cDNA mutation rows into EditSeq records.

    Each row selects hg19 or hg38 via its 'version' column; returns a
    DataFrame with sequence_name, editseq and mutation_type columns.
    """
    records = []

    # Plain-dict iteration is much faster than iterrows().
    for entry in df_snp.to_dict("records"):
        orig = entry["original_codon"]
        orig = str(orig).strip() if pd.notna(orig) else ""
        repl = entry["mutation_code"]
        repl = str(repl).strip() if pd.notna(repl) else ""

        # Pick the reference genome matching this row's coordinate version.
        version = str(entry["version"]).lower()
        genome_to_use = genome_hg38 if version == "hg38" else genome_hg19

        # Mutation class used in the sequence name.
        if orig and repl:
            mut_type = "REPL"
        elif orig and not repl:
            mut_type = "DEL"
        elif not orig and repl:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"

        # Sequence name: gene, class and a filesystem-safe cDNA code.
        cdna_code_clean = str(entry["cdna code"]).replace(".", "").replace("_", "p")
        seq_name = f"{entry['gene']}_{mut_type}_{cdna_code_clean}"

        editseq = generate_editseq(
            original=orig,
            replacement=repl,
            region_str=str(entry["mutation_region"]),
            genome=genome_to_use,
        )

        records.append(
            {"sequence_name": seq_name, "editseq": editseq, "mutation_type": mut_type}
        )

    return pd.DataFrame(records)
|
||||
|
||||
|
||||
def run_analysis(infile, reference, outdir):
    """Split the designed-mutation table *infile* by strategy and write one
    EditSeq CSV per strategy into *outdir*, using *reference* as the genome.
    """
    # AA_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_aa_target.csv"
    # SNP_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_snp_target.csv"

    # genome_hg19 = Fasta(HG19_FASTA_PATH)
    # genome_hg38 = Fasta(HG38_FASTA_PATH)
    genome = Fasta(reference)
    os.makedirs(outdir, exist_ok=True)

    aa_df_input = pd.read_csv(infile)
    # snp_df_input = pd.read_csv(SNP_INPUT_FILE)

    # --- Stage 1: process SNP/cDNA mutations (currently disabled) ---
    # snp_df = process_snp_mutations(snp_df_input, genome_hg19, genome_hg38)
    # snp_output_file = "snp_editseq_output.csv"
    # snp_df.to_csv(snp_output_file, index=False)

    # --- Stage 2: process AA mutations grouped by strategy ---
    aa_df_input["strategy"] = aa_df_input["strategy"].str.upper()
    strategies = aa_df_input["strategy"].unique()

    for strategy in strategies:
        # Skip empty strategies.
        if pd.isna(strategy):
            continue

        # Filter to this strategy's subset.
        aa_subset_df = aa_df_input[aa_df_input["strategy"] == strategy].copy()
        if aa_subset_df.empty:
            continue

        # Process the subset.
        aa_df_processed = process_aa_mutations(aa_subset_df, genome)

        # Save one CSV per strategy.
        aa_output_file = f"aa_{strategy}_editseq_output.csv"
        aa_df_processed.to_csv(os.path.join(outdir, aa_output_file), index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Bug fix: run_analysis() has three required parameters, so the previous
    # bare call always raised TypeError.  Take the arguments from the command
    # line instead.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {os.path.basename(sys.argv[0])} <infile> <reference> <outdir>")
    run_analysis(infile=sys.argv[1], reference=sys.argv[2], outdir=sys.argv[3])
|
||||
53
design/src/liftover.py
Normal file
53
design/src/liftover.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import pandas as pd
|
||||
|
||||
from pyliftover import LiftOver
|
||||
|
||||
# lo = LiftOver('/home/zym/projects/pgRNA/liftover/hg19ToHg38.over.chain.gz')
|
||||
|
||||
from pyliftover import LiftOver
|
||||
|
||||
# Build the hg19 -> hg38 coordinate converter.
# NOTE(review): constructed at import time, which fetches/loads the chain
# file as a module side effect — consider lazy initialization.
lo = LiftOver("hg19", "hg38")
|
||||
|
||||
|
||||
def convert_interval(chrom, start, end):
    """
    Lift the interval (start, end) from the source genome (hg19) to the
    target genome (hg38).

    Returns (new_chrom, new_start, new_end), or None when either endpoint
    cannot be converted.

    NOTE(review): pyliftover's documentation states convert_coordinate()
    expects 0-based positions (the original comment here claimed 1-based) —
    confirm which convention callers pass before relying on exact offsets.
    """
    result_start = lo.convert_coordinate(chrom, start)
    result_end = lo.convert_coordinate(chrom, end)

    if not result_start or not result_end:
        return None  # unconvertible

    # Keep the mapping with the highest chain score.
    best_start = max(result_start, key=lambda x: x[3])
    best_end = max(result_end, key=lambda x: x[3])

    # NOTE(review): only the start's chromosome is returned — if the two
    # endpoints lift to different chromosomes this silently mixes them.
    new_chrom = best_start[0]
    new_start = best_start[1]
    new_end = best_end[1]

    # Guarantee start <= end (strand flips can reverse the order).
    if new_start >= new_end:
        new_start, new_end = new_end, new_start
    return new_chrom, new_start, new_end
|
||||
|
||||
|
||||
def get_seq(path, coord):
    """Return the sequence for coord=(chrom, start, end) from an indexed FASTA.

    Uses pysam's region-string fetch; pysam reads the accompanying .fai index
    automatically.  NOTE(review): region strings in pysam are 1-based
    inclusive (unlike the 0-based start/end argument form) — confirm the
    convention callers expect.
    """
    import pysam

    # Bug fix: close the FASTA handle when done instead of leaking the file
    # descriptor (the previous version never called close()).
    with pysam.FastaFile(path) as fasta:
        return fasta.fetch(region=f"{coord[0]}:{coord[1]}-{coord[2]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # No CLI behaviour yet; this module is used as a library.
    pass
|
||||
216
design/src/mutation.py
Normal file
216
design/src/mutation.py
Normal file
@@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import itertools
|
||||
|
||||
from Bio.Seq import Seq
|
||||
from loguru import logger
|
||||
|
||||
# Standard genetic code: DNA codon -> one-letter amino acid ('*' = stop).
codon_table = {
    # Phe / Leu
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    # Ser
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    # Tyr / stop
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    # Cys / stop / Trp
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
    # Leu
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    # Pro
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    # His / Gln
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    # Arg
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    # Ile / Met
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    # Thr
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    # Asn / Lys
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    # Ser / Arg
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    # Val
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    # Ala
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    # Asp / Glu
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    # Gly
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}
|
||||
|
||||
# Reverse lookup table: amino acid -> every codon that encodes it.
aa_to_codons = {}
for codon, aa in codon_table.items():
    aa_to_codons.setdefault(aa, []).append(codon)

# DNA alphabet used when enumerating variants.
bases = ["A", "T", "G", "C"]
|
||||
|
||||
|
||||
def generate_nnn():
    """Return all 64 NNN codon combinations (A/T/G/C product order)."""
    return ["".join(triplet) for triplet in itertools.product("ATGC", repeat=3)]
|
||||
|
||||
|
||||
def generate_2n_variants(original_codon, fixed_position=None):
    """
    Generate double-N variants: two codon positions vary freely, one keeps
    the original base.

    fixed_position: 0, 1 or 2 — which position stays unchanged.  When None,
    all three patterns (NNx, NxN, xNN) are produced.

    Returns a sorted, de-duplicated list of codon strings (the original
    codon itself is always included).
    """
    variants = set()
    positions = [fixed_position] if fixed_position is not None else [0, 1, 2]

    for pos in positions:
        free = [i for i in range(3) if i != pos]
        for b1, b2 in itertools.product("ATGC", repeat=2):
            codon = ["_", "_", "_"]
            codon[pos] = original_codon[pos]
            codon[free[0]] = b1
            codon[free[1]] = b2
            variants.add("".join(codon))

    return sorted(variants)
|
||||
|
||||
|
||||
def generate_1n_variants(original_codon, fixed_positions=None):
|
||||
"""
|
||||
生成单N突变(一个位置随机,两个固定)
|
||||
fixed_positions: 如 [0,1] 表示第0和第1位固定
|
||||
若不指定,则生成所有三种模式:ANT, ATN, TAN
|
||||
"""
|
||||
variants = set()
|
||||
if fixed_positions:
|
||||
positions = [fixed_positions]
|
||||
else:
|
||||
positions = [[0, 1], [0, 2], [1, 2]]
|
||||
|
||||
for fix in positions:
|
||||
var_pos = 3 - sum(fix) # 剩下那个位置是变量
|
||||
for i in range(3):
|
||||
if i not in fix:
|
||||
var_pos = i
|
||||
break
|
||||
base1, base2 = original_codon[fix[0]], original_codon[fix[1]]
|
||||
for b in bases:
|
||||
codon_list = ["_", "_", "_"]
|
||||
codon_list[fix[0]] = base1
|
||||
codon_list[fix[1]] = base2
|
||||
codon_list[var_pos] = b
|
||||
variant = "".join(codon_list)
|
||||
variants.add(variant)
|
||||
return sorted(variants)
|
||||
|
||||
|
||||
def translate(codon):
    """Translate one DNA codon via codon_table; unknown codons map to 'X'."""
    return codon_table.get(codon, "X")
|
||||
|
||||
|
||||
def design_mutations_for_orf(dna_seq, strategy="3N"):
    """
    Design saturation mutations across a whole ORF sequence.

    strategy: '3N' (all 64 codons), '2N' (two positions vary) or
    '1N' (one position varies).

    Returns one dict per codon position with the original codon/amino acid,
    the variant codons, and summary counts (distinct amino acids, stops).

    Raises ValueError when len(dna_seq) is not a multiple of 3, or on an
    unknown strategy.
    """
    if len(dna_seq) % 3 != 0:
        raise ValueError(f"ORF 长度必须是 3 的倍数!{dna_seq}")

    num_codons = len(dna_seq) // 3
    results = []

    for i in range(num_codons):
        start = i * 3
        end = start + 3
        orig_codon = dna_seq[start:end]
        orig_aa = translate(orig_codon)

        logger.debug(
            f"\n--- 位点 {i + 1} (氨基酸 {i + 1}): {orig_aa} ({orig_codon}) ---"
        )

        # Enumerate variant codons according to the chosen strategy.
        variants = []
        if strategy == "3N":
            variants = generate_nnn()
            logger.debug(f"策略: 3N (NNN) → 共 {len(variants)} 种组合")
        elif strategy == "2N":
            variants = generate_2n_variants(orig_codon)
            logger.debug(f"策略: 2N (任意两个随机) → 共 {len(variants)} 种组合")
        elif strategy == "1N":
            variants = generate_1n_variants(orig_codon)
            logger.debug(f"策略: 1N (任意一个随机) → 共 {len(variants)} 种组合")
        else:
            raise ValueError("strategy 必须是 '3N', '2N', 或 '1N'")

        # Filter out malformed codons (should never occur in practice).
        valid_variants = [v for v in variants if len(v) == 3]

        # Summarize the mutational outcome per amino acid.
        mutant_aa_count = {}
        stop_count = 0
        for v in valid_variants:
            aa = translate(v)
            if aa == "*":
                stop_count += 1
            mutant_aa_count[aa] = mutant_aa_count.get(aa, 0) + 1

        logger.debug(f"→ 共产生 {len(valid_variants)} 个有效突变")
        logger.debug(f"→ 可产生 {len(mutant_aa_count)} 种不同氨基酸(含终止)")
        logger.debug(f"→ 引入终止密码子: {stop_count} 次")
        logger.debug(f"→ 氨基酸分布: {mutant_aa_count}")

        results.append(
            {
                "position": i + 1,
                "original_codon": orig_codon,
                "original_aa": orig_aa,
                "variants": valid_variants,
                "variant_count": len(valid_variants),
                "mutant_aa_count": mutant_aa_count,
                "stop_count": stop_count,
            }
        )

    return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # No CLI behaviour yet; this module is used as a library.
    pass
|
||||
236
design/src/reader.py
Normal file
236
design/src/reader.py
Normal file
@@ -0,0 +1,236 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
从gtf中读取CDS,并读取对应的sequence
|
||||
"""
|
||||
|
||||
import gzip
|
||||
|
||||
import pandas as pd
|
||||
from Bio.Seq import Seq
|
||||
from loguru import logger
|
||||
from pyfaidx import Fasta
|
||||
|
||||
|
||||
class Region(object):
    """A genomic interval (chrom, start, end, strand) used for merging and
    for carrying an extracted sequence plus codon-boundary bookkeeping.
    """

    def __init__(self, chrom, start, end, strand="+", kind=None):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.strand = strand
        self.sequence = None  # filled in later via set_seq()
        self.kind_ = kind  # raw boundary label; exposed through the `kind` property
        self.addition = None  # optional payload, e.g. the source CDS region

    @classmethod
    def create(cls, region):
        """Alternate constructor from a 'chrom:start-end' string ('+' strand)."""
        chrom, sites = region.split(":")
        sites = [int(x) for x in sites.split("-")]
        return cls(chrom, sites[0], sites[-1], "+")

    def set_seq(self, sequence: str):
        """Attach *sequence*, uppercased; reverse-complement on the '-' strand."""
        self.sequence = sequence.upper()

        # Bug fix: reverse-complement the uppercased copy (self.sequence).
        # Previously the raw input was used here, so minus-strand sequences
        # skipped the upper() normalization entirely.
        if self.strand == "-":
            self.sequence = str(Seq(self.sequence).reverse_complement())

    def __and__(self, other):
        """True when *other* overlaps this region on the same chromosome."""
        if self.chrom != other.chrom:
            return False

        return self.start < other.end and self.end > other.start

    def __add__(self, other):
        """Merge an overlapping region into this one in place; returns self.

        Raises ValueError when the regions do not overlap.
        """
        if not self & other:
            raise ValueError("没有重合位点")

        self.start = min(self.start, other.start)
        self.end = max(self.end, other.end)
        return self

    def __str__(self) -> str:
        return f"{self.chrom}:{self.start}-{self.end}"

    def __hash__(self):
        # Hash by coordinate string; note there is no matching __eq__, so
        # equality remains identity-based.
        return hash(str(self))

    def __len__(self):
        return self.end - self.start

    @property
    def kind(self):
        """Boundary label; sub-codon (<3 bp) regions get a 'cross_' prefix."""
        if len(self) >= 3:
            return self.kind_
        if not self.kind_:
            return ""
        else:
            return f"cross_{self.kind_}"
|
||||
|
||||
|
||||
def read_gtf(gtf_path):
    """
    Load a GTF file and return a DataFrame of its CDS records, with the
    ``attribute`` column parsed into one extra column per attribute key.
    """
    logger.info("正在读取 GTF 文件...")
    gtf_columns = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attribute",
    ]

    raw = pd.read_csv(
        gtf_path,
        sep="\t",
        comment="#",
        header=None,
        names=gtf_columns,
        low_memory=False,
    )

    # Keep only CDS features.
    cds_rows = raw[raw["feature"] == "CDS"].copy()

    # Expand the attribute strings into columns via json_normalize.
    try:
        parsed_attrs = pd.json_normalize(cds_rows["attribute"].apply(parse_attributes))
    except Exception as e:
        logger.error(f"解析 attribute 字段失败: {e}")
        raise

    # Glue the parsed attribute columns onto the original rows.
    result_df = pd.concat([cds_rows.reset_index(drop=True), parsed_attrs], axis=1)

    logger.info(f"成功读取并解析 GTF 文件,共 {len(result_df)} 个 CDS 特征。")
    return result_df
|
||||
|
||||
|
||||
def parse_attributes(attr_str):
    """
    Parse a GTF ``attribute`` field into a ``{key: unquoted value}`` dict.

    Empty entries and entries without a space separator are skipped.
    """
    parsed = {}
    for chunk in attr_str.split(";"):
        chunk = chunk.strip()
        if chunk and " " in chunk:
            key, _, value = chunk.partition(" ")
            parsed[key] = value.strip('"')
    return parsed
|
||||
|
||||
|
||||
def get_cds_for_gene(cds_df, gene_name):
    """
    Collect the CDS rows for one transcript and merge them into Regions.

    NOTE(review): despite the name and the original docstring ("group by
    transcript, pick the longest"), the code matches
    ``transcript_id == gene_name`` exactly -- callers must pass a transcript
    ID, not a gene symbol.
    """
    logger.info(f"正在查找基因 '{gene_name}' 的 CDS...")

    # Parse the raw attribute strings (adds a column to the caller's frame).
    cds_df["attributes_parsed"] = cds_df["attribute"].apply(parse_attributes)

    # Keep only rows belonging to the requested transcript.
    gene_cds_list = []
    for idx, row in cds_df.iterrows():
        attrs = row["attributes_parsed"]
        if attrs.get("transcript_id") == gene_name:
            # if attrs.get('gene_name') == gene_name or attrs.get('gene_id').startswith(gene_name):
            gene_cds_list.append(row)

    if not gene_cds_list:
        raise ValueError(f"未在 GTF 中找到基因 '{gene_name}'")

    # assumes cds_df came from read_gtf() so "transcript_id" exists as a
    # top-level column -- TODO confirm for other callers.
    df = pd.DataFrame(gene_cds_list)
    df = df[
        ["seqname", "feature", "start", "end", "strand", "transcript_id"]
    ].drop_duplicates()

    # Merge overlapping CDS intervals in file order.  Region.__add__ extends
    # ``last`` in place, so the aliasing below is intentional.
    res = []
    last = None
    for _, row in df.iterrows():
        temp = Region(
            str(row["seqname"]),
            row["start"],
            row["end"],
            str(row["strand"]),
            row["transcript_id"],  # stored as Region.kind_
        )
        if last is None:
            last = temp
        elif temp & last:
            last = last + temp
        else:
            res.append(last)
            last = temp
    # ``in`` falls back to identity comparison (Region defines no __eq__),
    # so this appends the final merged region exactly once.
    if last not in res:
        res.append(last)

    return res
|
||||
|
||||
|
||||
def load_uniprot_region(path):
    """
    Read ``chrom:start-end`` intervals (first whitespace-separated column of
    each line) from *path*, merging consecutive overlapping ones.

    Lines starting with ``#`` are skipped.
    """
    merged = []
    current = None
    with open(path) as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            candidate = Region.create(line.split()[0])
            if current is None:
                current = candidate
            elif candidate & current:
                # Region.__add__ extends ``current`` in place.
                current = current + candidate
            else:
                merged.append(current)
                current = candidate

    if current not in merged:
        merged.append(current)
    return merged
|
||||
|
||||
|
||||
def extract_orf_sequence(genome_fasta, cds_rows, half_open=False):
    """
    Extract the genomic sequence of each CDS Region so they can be
    concatenated into an ORF.

    Parameters:
        genome_fasta: path to an indexed reference FASTA file.
        cds_rows: list of Region objects; chrom/strand taken from the first.
        half_open: when True, treat ``end`` as exclusive (subtract 1).

    Returns the same list with each Region's ``sequence`` filled in.
    Raises ValueError when ``cds_rows`` is empty.
    """

    if not cds_rows:
        raise ValueError("not cds")

    seqname = cds_rows[0].chrom
    strand = cds_rows[0].strand

    logger.debug(f"从参考基因组提取序列 (chr{seqname})...")
    genome = Fasta(genome_fasta)

    # Fetch the chromosome record; retry with/without the "chr" prefix,
    # since FASTA naming conventions differ (e.g. "chr1" vs "1").
    try:
        chrom_seq = genome[seqname]
    except KeyError:
        if "chr" in seqname:
            seqname = seqname.replace("chr", "")
        else:
            seqname = "chr" + seqname
        chrom_seq = genome[seqname]

    for row in cds_rows:
        start = int(row.start) - 1  # GTF is 1-based; pyfaidx is 0-based
        end = int(row.end) - (1 if half_open else 0)

        # Sub-codon fragments flagged as crossing a CDS boundary are widened
        # to a full 3 bp codon on the appropriate side.
        if len(row) < 3 and "cross" in row.kind:
            if row.kind == "cross_start":
                start = end - 3
            else:
                end = start + 3

        row.set_seq(chrom_seq[start:end].seq)

    return cds_rows
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Library module only -- no standalone CLI behavior.
    pass
|
||||
139
design/src/safe_target.py
Normal file
139
design/src/safe_target.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
生成safe targeting 序列
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import random
|
||||
import pysam
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
seed = 42
|
||||
random.seed(42)
|
||||
|
||||
|
||||
__AAs__ = {
|
||||
"丙氨酸": ["GCU", "GCC", "GCA", "GCG"],
|
||||
"精氨酸": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
|
||||
"天冬酰胺": ["AAU", "AAC"],
|
||||
"天冬氨酸": ["GAU", "GAC"],
|
||||
"半胱氨酸": ["UGU", "UGC"],
|
||||
"谷氨酰胺": ["CAA", "CAG"],
|
||||
"谷氨酸": ["GAA", "GAG"],
|
||||
"甘氨酸": ["GGU", "GGC", "GGA", "GGG"],
|
||||
"组氨酸": ["CAU", "CAC"],
|
||||
"异亮氨酸": ["AUU", "AUC", "AUA"],
|
||||
"亮氨酸": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"],
|
||||
"赖氨酸": ["AAA", "AAG"],
|
||||
"甲硫氨酸": ["AUG"],
|
||||
"苯丙氨酸": ["UUU", "UUC"],
|
||||
"脯氨酸": ["CCU", "CCC", "CCA", "CCG"],
|
||||
"丝氨酸": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
|
||||
"苏氨酸": ["ACU", "ACC", "ACA", "ACG"],
|
||||
"色氨酸": ["UGG"],
|
||||
"酪氨酸": ["UAU", "UAC"],
|
||||
"缬氨酸": ["GUU", "GUC", "GUA", "GUG"],
|
||||
"终止密码子": ["UAA", "UAG", "UGA"],
|
||||
}
|
||||
|
||||
|
||||
def codons():
|
||||
for key, values in __AAs__.items():
|
||||
yield key, [value.replace("U", "T") for value in values]
|
||||
|
||||
|
||||
class Region:
    """A candidate safe-harbor interval, consumed codon-by-codon.

    ``shift(fasta)`` finds the first offset at which a known codon starts;
    ``choose(n)`` then samples codon-sized sub-regions from that frame.
    """

    def __init__(self, chrom: str, start: int, end: int):
        self.chrom = chrom
        self.start = start
        self.end = end
        # Reading-frame offset discovered by shift(); 0 until computed.
        # NOTE(review): dunder-style name for a plain private attribute is
        # unconventional (``_shift`` would be idiomatic).
        self.__shift__ = 0

    def __str__(self):
        return f"{self.chrom}:{self.start}-{self.end}"

    def shift(self, fasta: str):
        """Scan forward until a 3 bp window matches any codon in the table.

        NOTE(review): a frame found at offset 0 leaves ``__shift__ == 0``,
        so scanning continues and a later offset may overwrite it -- confirm
        this is intended.
        """
        for i in range(0, self.end - self.start):
            # Early exit once a previous iteration fixed the frame.
            if self.__shift__ != 0:
                break
            seq = extract_fastq_seq(fasta, Region(self.chrom, self.start+i, self.start+i+3))

            for _, values in codons():
                if seq in values:
                    self.__shift__ = i
                    break

    def choose(self, number: int = 3):
        """Return up to *number* random codon-aligned sub-regions.

        Falls back to returning every candidate when fewer than *number*
        complete codons fit into the interval.
        """
        length_of_codon = 3

        # Enumerate all complete in-frame codon windows.
        regions = []
        for i in range(self.start + self.__shift__, self.end, length_of_codon):
            if i + length_of_codon > self.end:
                break
            regions.append([i, i + length_of_codon])

        # np.choice(my_list, size=3, replace=False)

        if number > len(regions):
            return [Region(self.chrom, x[0], x[1]) for x in regions]

        return [Region(self.chrom, x[0], x[1]) for x in random.sample(regions, number)]
|
||||
|
||||
|
||||
def extract_fastq_seq(fastq: str, region: Region, seq_len: int = 100):
    """Fetch the reference bases covering *region* from an indexed FASTA.

    NOTE(review): despite the parameter name, ``fastq`` is a FASTA path
    (pysam.FastaFile), and ``seq_len`` is currently unused here.
    """
    with pysam.FastaFile(fastq) as fh:
        rec = fh.fetch(region.chrom, region.start, region.end)
        return rec
|
||||
|
||||
|
||||
def mutation(seq: str):
    """Return a random DNA codon encoding a DIFFERENT amino acid than *seq*.

    Reseeding with the module-level ``seed`` on every call makes the choice
    deterministic for a given input.  Implicitly returns None when *seq* is
    not found in the table (callers check for this).

    NOTE(review): *seq* is DNA (contains T) but ``__AAs__`` values are RNA
    codons (contain U), so any codon whose RNA form contains U can never
    match and yields None -- confirm this filtering is intended.
    """
    random.seed(seed)
    for key, value in __AAs__.items():
        if seq in value:
            # Pick a different amino acid, then one of its codons (as DNA).
            random_keys = random.sample([x for x in __AAs__.keys() if x != key], 1)[0]
            return random.sample(__AAs__[random_keys], 1)[0].replace("U", "T")
|
||||
|
||||
|
||||
|
||||
def main(infile, outfile, reference = "../ref/UCSC/hg19.fa.gz", seq_len: int = 100):
    """Design safe-targeting control edits.

    Samples 2000 rows from the "Human Safe Regions" sheet of *infile*, picks
    up to 5 codon-aligned sub-regions per entry, mutates each codon to a
    different amino acid, and writes a CSV with columns
    (sequence_name, editseq), where editseq is "<flank>(ref/mut)<flank>"
    with *seq_len* bases of flank on each side.
    """

    meta = pd.read_excel(infile, sheet_name="Human Safe Regions", header=None)
    # Deterministic subsample driven by the module-level seed.
    meta = meta.sample(n=2000, random_state=seed)

    data = []
    for idx in tqdm(meta.iloc[:, 0], total=meta.shape[0]):

        # First column appears to be "chrom;start;end" -- TODO confirm
        # against the source spreadsheet.
        idx = idx.split(";")
        region = Region(idx[0], int(idx[1]), int(idx[2]))
        region.shift(reference)

        regions = region.choose(5)

        for reg in regions:
            seq = extract_fastq_seq(reference, reg)
            mut = mutation(seq)

            # mutation() returns None for codons it cannot map; skip those.
            if seq is None or mut is None:
                continue

            key = str(reg) + "_" + seq + "_" + mut
            before = extract_fastq_seq(reference, Region(region.chrom, reg.start - seq_len, reg.start))
            after = extract_fastq_seq(reference, Region(region.chrom, reg.end, reg.end + seq_len))

            seq = f"{before}({seq}/{mut}){after}"
            data.append({"sequence_name": key, "editseq": seq})

    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: exposes main()'s parameters via python-fire.
    from fire import Fire
    Fire(main)
|
||||
|
||||
114
design/src/snp.py
Normal file
114
design/src/snp.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""用来解析snp错配信息"""
|
||||
|
||||
import re
|
||||
from itertools import product
|
||||
|
||||
|
||||
def generate_sequences_with_combinations(seq):
    """
    Expand every run of ``N`` in a DNA sequence into all A/T/C/G
    combinations and return the list of concrete sequences.

    Parameters:
        seq (str): input DNA sequence, possibly containing N characters.

    Returns:
        list: every possible sequence, in deterministic order (fixed
        segments kept as-is; each N position enumerates "ATCG" in order).
    """
    if "N" not in seq:
        return [seq]

    # Split into alternating maximal runs of N and non-N, preserving order.
    # This replaces the original hand-rolled index scan (and its redundant
    # function-local re-import of itertools.product) with one regex pass.
    pieces = re.findall(r"N+|[^N]+", seq)

    # Each fixed piece contributes exactly one option; each run of k Ns
    # contributes the 4**k combinations of "ATCG".
    per_piece = [
        ["".join(p) for p in product("ATCG", repeat=len(piece))]
        if piece[0] == "N"
        else [piece]
        for piece in pieces
    ]

    # Cartesian product across pieces reassembles the full sequences in the
    # same order the original nested loops produced them.
    return ["".join(parts) for parts in product(*per_piece)]
|
||||
|
||||
|
||||
def decode_snp(label, ref_start=0):
    """Parse an HGVS-like variant label into positions and an edit rule.

    Parameters:
        label: e.g. ``g.123A>G`` or ``NM_xxx:c.76+1G>A``; an optional
            accession prefix before ``:`` is dropped.
        ref_start: offset added to every position; must be > 0 for ``c.``
            labels and <= 0 for ``g.`` labels.

    Returns:
        (sorted list of int positions, rule string with coordinates
        stripped, e.g. "A>G"), or "" when *label* is None (kept for
        backward compatibility).

    Raises:
        ValueError: when the label prefix does not match ``ref_start``.
    """
    if label is None:
        return ""

    # Drop an accession prefix such as "NM_000546.6:".
    if ":" in label:
        label = label.split(":")[-1]

    if ref_start <= 0 and not label.startswith("g."):
        raise ValueError(f"{label} not genomic label")
    elif ref_start > 0 and not label.startswith("c."):
        raise ValueError(f"{label} not cdna label")

    # Remove the coordinate-system prefix and any "[n]" multipliers.
    label = re.sub(r"([cg]\.|\[\d+\])", "", label)

    sites = []

    for part in label.split("_"):
        if not part:
            continue

        # Keep only digits and intron-offset signs for this coordinate.
        part = re.sub(r"[^\d\+-]", "", part)
        if "+" in part:
            nums = [int(y) for y in part.split("+") if y]
            pos = nums[0] + nums[-1] if len(nums) > 1 else nums[0]
        elif "-" in part:
            nums = [int(y) for y in part.split("-") if y]
            # BUGFIX: HGVS "n-m" means m bases BEFORE position n, so the
            # offset must be subtracted (the original added it, decoding
            # c.76-1 to 77 instead of 75).  A lone leading "-" (5'UTR
            # coordinate) now yields a negative position instead of
            # crashing on int("").
            pos = nums[0] - nums[-1] if len(nums) > 1 else -nums[0]
        else:
            pos = int(part)

        sites.append(pos + ref_start)

    sites = sorted(sites)

    # Whatever is left after stripping numbers/offsets is the edit rule.
    rule = re.sub(r"[\d_\+-]", "", label)
    return sites, rule.strip()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # No CLI: this module is imported by design/main.py.
    pass
|
||||
|
||||
263
design/src/snv-N-2N-3N.py
Normal file
263
design/src/snv-N-2N-3N.py
Normal file
@@ -0,0 +1,263 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import itertools
|
||||
import random
|
||||
import gzip
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
|
||||
# Design overview: split the target sequence into 3 bp codons and, for each
# codon, systematically generate three mutation classes (1N/2N/3N base
# substitutions).  Every mutation gets 100 bp of up/downstream flanking
# sequence attached, and 150 mutations are randomly sampled per class.

# Mutation design constants
NUCLEOTIDES = ["A", "T", "C", "G"]  # DNA alphabet used for substitutions
UPSTREAM_LEN = 100      # bp of upstream flank attached to each edit
DOWNSTREAM_LEN = 100    # bp of downstream flank attached to each edit
TARGET_MUTATIONS = 150  # mutations sampled per strategy (1N/2N/3N)
|
||||
|
||||
|
||||
class Region:
    """A single 3 bp codon window within the full target sequence.

    Stores the codon's coordinates inside the gene sequence plus its
    absolute offset, which downstream code uses to slice flanks.
    """

    def __init__(
        self, chrom: str, start: int, end: int, sequence: str, absolute_index: int
    ):
        self.chrom = chrom
        self.start = start
        self.end = end
        # Normalise once so downstream string comparisons are uniform.
        self.sequence = sequence.upper()
        self.absolute_index = absolute_index
|
||||
|
||||
|
||||
def read_fasta(fasta_path: str) -> Dict[str, str]:
    """Parse a (optionally gzipped) FASTA file into ``{record id: sequence}``.

    Sequences are upper-cased with U transcribed to T; the record id is the
    first whitespace-delimited token of the header line.  Returns an empty
    dict when the file does not exist.
    """
    records: Dict[str, str] = {}
    header: Optional[str] = None
    open_fn = gzip.open if fasta_path.endswith(".gz") else open

    if not os.path.exists(fasta_path):
        logger.error(f"FASTA file not found: {fasta_path}")
        return {}

    chunks: List[str] = []

    def _flush() -> None:
        # Store the record accumulated so far, normalised to DNA upper case.
        if header and chunks:
            records[header] = "".join(chunks).upper().replace("U", "T")

    with open_fn(fasta_path, "rt") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            if raw.startswith(">"):
                _flush()
                header = raw[1:].split()[0]
                chunks = []
            elif header:
                chunks.append(raw)

    # Final record has no trailing ">" line to trigger the flush above.
    _flush()

    return records
|
||||
|
||||
|
||||
def split_sequence_to_codons(full_seq: str, gene_name: str) -> List[Region]:
    """Chop *full_seq* into complete 3 bp codon Regions.

    A trailing fragment of 1-2 bp is dropped; each Region records its
    absolute offset within *full_seq*.
    """
    # Only offsets with a full codon left (offset <= len - 3) are kept.
    return [
        Region(gene_name, offset, offset + 2, full_seq[offset : offset + 3], offset)
        for offset in range(0, len(full_seq) - 2, 3)
    ]
|
||||
|
||||
|
||||
def analyze_variant(ref: str, alt: str) -> str:
    """Render a substitution compactly with shared context outside the edit.

    Same-length inputs with one contiguous run of differences collapse to a
    single group ('CAT'->'CGC' gives 'C(AT/GC)'); scattered differences get
    one group per position ('CAT'->'GAG' gives '(C/G)A(T/G)').  Inputs of
    different length fall back to '(ref/alt)'.
    """
    if len(ref) != len(alt):
        return f"({ref}/{alt})"

    changed = [i for i, (r, a) in enumerate(zip(ref, alt)) if r != a]

    if not changed:
        return ref

    # One contiguous run of differences -> a single (run/run) group.
    if changed[-1] - changed[0] == len(changed) - 1:
        lo, hi = changed[0], changed[-1] + 1
        return f"{ref[:lo]}({ref[lo:hi]}/{alt[lo:hi]}){ref[hi:]}"

    # Scattered differences -> one (ref/alt) group per edited position.
    pieces = []
    cursor = 0
    for pos in changed:
        pieces.append(ref[cursor:pos])
        pieces.append(f"({ref[pos]}/{alt[pos]})")
        cursor = pos + 1
    pieces.append(ref[cursor:])

    return "".join(pieces)
|
||||
|
||||
|
||||
def generate_codon_mutations(original_codon: str, n_mutations: int) -> List[str]:
    """Return every codon differing from *original_codon* at exactly
    *n_mutations* positions, sorted alphabetically."""
    k = len(original_codon)
    variants = set()

    # Choose which positions mutate, then enumerate the alternative bases
    # at those positions while pinning the rest.
    for hit_positions in itertools.combinations(range(k), n_mutations):
        hits = set(hit_positions)
        pools: List[List[str]] = [
            [b for b in NUCLEOTIDES if b != base] if i in hits else [base]
            for i, base in enumerate(original_codon)
        ]
        for picked in itertools.product(*pools):
            candidate = "".join(picked)
            if candidate != original_codon:
                variants.add(candidate)

    return sorted(variants)
|
||||
|
||||
|
||||
def generate_editseq_and_metadata(
    full_seq: str, regions: List[Region], gene_name: str
) -> pd.DataFrame:
    """Build the complete 1N/2N/3N mutation table with flanked edit sequences."""
    rows: List[Dict[str, str]] = []

    for codon_index, region in enumerate(regions):
        codon_start = region.absolute_index
        ref_codon = region.sequence

        # Flanking context, clipped at the sequence boundaries.
        upstream = full_seq[max(0, codon_start - UPSTREAM_LEN) : codon_start]
        downstream = full_seq[
            codon_start + 3 : min(len(full_seq), codon_start + 3 + DOWNSTREAM_LEN)
        ]

        for strategy, n_mut in (("3N", 3), ("2N", 2), ("1N", 1)):
            for alt_codon in generate_codon_mutations(ref_codon, n_mut):
                # Compact (ref/alt) notation shared with the rest of the pipeline.
                simplified_codon = analyze_variant(ref_codon, alt_codon)

                rows.append(
                    {
                        # GENE_SUB_STRATEGY_AAINDEX_ORIGINAL>MUTATION
                        "sequence_name": f"{gene_name}_SUB_{strategy}_AA{codon_index + 1}_{ref_codon}>{alt_codon}",
                        # editseq: upstream flank + edit notation + downstream flank
                        "editseq": f"{upstream}{simplified_codon}{downstream}",
                        "strategy": strategy,
                        "mutation_type": "REPL",  # Replacement
                    }
                )

    return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def run_mutation_design(fasta_file: str, gene_name: str, output_base_name: str):
    """Execute the mutation-design pipeline and save one CSV per strategy.

    Parameters:
        fasta_file: path to the FASTA containing the target transcript.
        gene_name: substring matched (case-insensitively) against record IDs.
        output_base_name: file-name template containing the literal
            "{strategy}" placeholder (replaced by 1N/2N/3N).
    """

    logger.info(f"Targeting gene: {gene_name}")
    fasta_data = read_fasta(fasta_file)
    full_seq, target_id = "", ""

    # Locate target sequence by substring match on the record ID.
    for seq_id, seq in fasta_data.items():
        if gene_name.upper() in seq_id.upper():
            full_seq = seq
            target_id = seq_id
            break

    if not full_seq and fasta_data:
        # Fallback: use longest sequence
        target_id, full_seq = max(fasta_data.items(), key=lambda item: len(item[1]))
        if full_seq:
            logger.warning(
                f"Using longest sequence ID: {target_id} (Length: {len(full_seq)} bp)"
            )

    if not full_seq:
        logger.error(f"Failed to extract target sequence.")
        return

    logger.info(f"Target sequence ID: {target_id}, Length: {len(full_seq)} bp")

    # 1. Generate ALL mutations (1N, 2N, 3N)
    cds_regions = split_sequence_to_codons(full_seq, gene_name)
    all_mutations_df = generate_editseq_and_metadata(full_seq, cds_regions, gene_name)

    # 2. Process and save one file per strategy.
    strategies = ["1N", "2N", "3N"]

    for strategy in strategies:
        # Filter for the current strategy
        strategy_df = all_mutations_df[all_mutations_df["strategy"] == strategy].copy()
        original_count = len(strategy_df)

        # Determine output file name (e.g., AAVS1_1N_150_mutations.csv)
        output_file_name = output_base_name.replace("{strategy}", strategy)

        if original_count == 0:
            logger.warning(
                f"Strategy {strategy}: No mutations generated. Skipping file creation for {output_file_name}."
            )
            continue

        # Deterministic random sampling (fixed random_state) down to the
        # target count; smaller sets are saved in full.
        if original_count > TARGET_MUTATIONS:
            final_df = strategy_df.sample(n=TARGET_MUTATIONS, random_state=42)
            logger.success(
                f"Strategy {strategy}: Sampled {TARGET_MUTATIONS} mutations from {original_count} designs."
            )
        else:
            final_df = strategy_df
            logger.warning(
                f"Strategy {strategy}: Generated {original_count} mutations; saving all."
            )

        # Save result, ensuring column order
        final_df[["sequence_name", "editseq", "strategy", "mutation_type"]].to_csv(
            output_file_name, index=False
        )
        logger.success(f"Strategy {strategy}: Design saved to {output_file_name}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hard-coded run configuration for the AAVS1 design.
    AAVS1_FASTA_PATH = (
        "/rawdata1/project/peRNA_design/ref/AAVS1/ncbi_dataset/data/rna.fna"
    )
    GENE_NAME = "AAVS1"
    # "{strategy}" is substituted with 1N/2N/3N inside run_mutation_design.
    OUTPUT_BASE_NAME = "AAVS1_{strategy}_150_mutations.csv"

    run_mutation_design(
        fasta_file=AAVS1_FASTA_PATH,
        gene_name=GENE_NAME,
        output_base_name=OUTPUT_BASE_NAME,
    )
|
||||
Reference in New Issue
Block a user