提交饱和编辑的相关设计,及检验代码

This commit is contained in:
2026-02-26 14:02:42 +08:00
commit cb556b47c0
36 changed files with 5437 additions and 0 deletions

450
design/main.py Normal file
View File

@@ -0,0 +1,450 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import math
import os
import re
import sys
from glob import glob
import requests as rq
import pandas as pd
from loguru import logger
import numpy as np
from src.mutation import design_mutations_for_orf
from src.reader import (extract_orf_sequence, get_cds_for_gene,
load_uniprot_region, read_gtf, Region)
from src.liftover import convert_interval
from src.snp import decode_snp, generate_sequences_with_combinations
import itertools
from src.editseq import run_analysis
# Remove loguru's default handler so we fully control the sink configured below
logger.remove()
# Add a stderr sink that only emits INFO-and-above messages
# logger.add(level="INFO")
logger.add(
    sys.stderr,
    colorize=True,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
    # <cyan>{name}</cyan>: <cyan>{function}</cyan>: <cyan>{line}</cyan>
    level="INFO"
)
def split_regions(cds):
    u"""
    Split CDS regions into 3 bp amino-acid (codon) regions.

    The running `start` pointer carries reading-frame state across segments:
    0 means "start fresh at the next segment"; a negative value means the
    previous segment ended mid-codon and more bases are needed from the next
    segment to complete the codon.

    NOTE(review): when a segment leaves a 1 bp remainder, the encoding
    `start - x.end + 1` evaluates to 0, which resets the frame instead of
    carrying it — confirm the intended coordinate convention (possible
    off-by-one).

    Test case:
    14:103698801-103699017
    14:103699133-103699179
    14:103699364-103699576
    14:103703173-103703327
    14:103707003-103707215
    14:103708522-103708659
    14:103711033-103711087
    """
    regions = []
    cds = sorted(cds, key=lambda x: (x.chrom, x.start, x.end))
    aa_codon_len = 3
    start = 0
    for x in cds:
        # start == 0: begin cleanly at this segment
        if start == 0:
            start = x.start
        elif start < 0:
            # previous segment ended mid-codon (-start bases still needed);
            # emit the small leading piece of this segment separately,
            # tagged "start" because it crosses the segment border
            regions.append(Region(x.chrom, x.start, x.start - start, kind="start"))
            regions[-1].addition = x
            start = x.start - start
        while start + aa_codon_len <= x.end:
            # tag codons that touch a segment border
            code = "regular"
            if start == x.start:
                code = "start"
            elif start + aa_codon_len == x.end:
                code = "end"
            regions.append(Region(x.chrom, start, start + aa_codon_len, kind=code))
            regions[-1].addition = x
            start += aa_codon_len
        if start < x.end:
            # trailing sub-codon piece: record it as crossing the "end"
            # border and encode the carry-over as a negative start
            regions.append(Region(x.chrom, start, x.end, kind="end"))
            regions[-1].addition = x
            start = start - x.end + 1
        else:
            # segment divided evenly; reset the pointer
            start = 0
    return regions
def download_uniprot_region(protein, output):
    """Download (or reuse a cached copy of) the UniProt genomic coordinate
    mapping for accession `protein` and write it to `output` as TSV.

    The raw JSON response is cached next to `output` (same name, `.json`
    extension). Only the first gnCoordinate entry located on a primary
    chromosome is written; each exon row is
    `chrom:begin-end<TAB>protein_begin<TAB>-<TAB>protein_end`.

    Raises:
        ValueError: if the accession is not a human protein.
    """
    # the original reused one `resp` variable for the cache path and the
    # parsed JSON, and shadowed the `protein` parameter inside the loop
    cache_path = output.replace(".tsv", ".json")
    url = f"https://www.ebi.ac.uk/proteins/api/coordinates?accession={protein}"
    if os.path.exists(cache_path):
        with open(cache_path, "r") as r:
            payload = json.load(r)
    else:
        payload = rq.get(url, headers={"Accept": "application/json"}).json()
        with open(cache_path, "w+") as w:
            json.dump(payload, w, indent=4)
    if not payload[0]["name"].endswith("HUMAN"):
        raise ValueError("protein is not human")
    primary_chroms = [str(x) for x in range(1, 23)] + ["chr" + str(x) for x in range(1, 23)] + ["X", "Y", "chrX", "chrY"]
    with open(output, "w+") as w:
        w.write(f"#{url}\n")
        for coord in payload[0]["gnCoordinate"]:
            chromosome = coord["genomicLocation"]["chromosome"]
            if chromosome not in primary_chroms:
                continue
            for exon in coord["genomicLocation"]["exon"]:
                genome_loc = exon["genomeLocation"]
                genome_span = str(genome_loc["begin"]["position"]) + "-" + str(genome_loc["end"]["position"])
                protein_loc = exon["proteinLocation"]
                # a single-residue location has "position" instead of begin/end
                if "end" not in protein_loc and "position" in protein_loc:
                    fields = [str(protein_loc["position"]["position"]), "-", str(protein_loc["position"]["position"])]
                else:
                    fields = [str(protein_loc["begin"]["position"]), "-", str(protein_loc["end"]["position"])]
                w.write(f"{chromosome}:{genome_span}\t" + "\t".join(fields) + "\n")
            # only the first usable coordinate set is written
            break
def get_aa_coords(genes, output):
    """For each gene listed in the Excel file `genes` (gene symbol expected
    in the second column), look up its first reviewed human UniProt accession
    and download its genomic coordinate table into
    `output/<gene>_<accession>.tsv`.
    """
    os.makedirs(output, exist_ok=True)
    df = pd.read_excel(genes)
    # df = df.loc[df["Batch"] == 1, :]
    for _, row in df.iterrows():
        # positional access must go through .iloc; `row[1]` relies on the
        # deprecated label-or-position fallback in modern pandas
        gene_name = row.iloc[1]
        url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json"
        resp = rq.get(url)
        for hit in resp.json().get("results", []):
            if "HUMAN" in hit["uniProtkbId"]:
                accession = hit["primaryAccession"]
                download_uniprot_region(accession, os.path.join(output, f"{gene_name}_{accession}.tsv"))
                # keep only the first reviewed human hit per gene
                break
def adjust_cross_border_region(row):
    """Return 'chrom:start-end' for a region, widening sub-codon (<3 bp)
    cross-border regions to a full 3 bp codon.

    A cross_start region is pulled back so it starts at end - 3; any other
    cross kind is extended to start + 3. Full-size or non-crossing regions
    are rendered via str(row) unchanged.
    """
    if "cross" in row.kind and len(row) < 3:
        if row.kind == "cross_start":
            return f"{row.chrom}:{row.end - 3}-{row.end}"
        return f"{row.chrom}:{row.start}-{row.start + 3}"
    return str(row)
def design_by_aa(genes, fasta, output, stop_codon = False):
    u"""Design per-codon mismatch sites and mutation rules.

    Args:
        genes: directory of per-gene UniProt coordinate TSVs.
        fasta: reference FASTA path.
        output: CSV output path.
        stop_codon: if True, keep only variants introducing a stop codon.
    """
    df = []
    for gene in glob(os.path.join(genes, "*.tsv")):
        logger.info(f"开始设计突变 {gene}...")
        key = os.path.basename(gene).split(".")[0]
        # load the recorded genomic regions for this gene
        cds = load_uniprot_region(gene)
        # re-partition them into per-amino-acid (3 bp) regions
        cds = split_regions(cds)
        if not cds:
            continue
        # attach reference sequences to each region
        cds = extract_orf_sequence(fasta, cds, half_open=True)
        for idx, x in enumerate(cds):
            for strategy in ["3N"]:
                results = design_mutations_for_orf(x.sequence, strategy=strategy)
                for res in results:
                    for var in res["variants"]:
                        if var == res["original_codon"]:
                            continue
                        # codons crossing an exon border: bases falling in
                        # the neighbouring intron must stay unchanged.
                        # cross_start with 1 bp in this exon -> first 2 bases fixed
                        if "cross_start" == x.kind and len(x) == 1 and var[:2] != x.sequence[:2]:
                            continue
                        # cross_start with 2 bp in this exon -> first base fixed
                        elif "cross_start" == x.kind and len(x) == 2 and var[0] != x.sequence[0]:
                            continue
                        # cross_end with 1 bp in this exon -> last 2 bases fixed
                        elif "cross_end" == x.kind and len(x) == 1 and var[1:] != x.sequence[1:]:
                            continue
                        # cross_end with 2 bp in this exon -> last base must be C/G
                        elif "cross_end" == x.kind and len(x) == 2 and var[-1] not in ["C", "G"]:
                            continue
                        row = [key, str(x.addition), idx+1, str(x), adjust_cross_border_region(x), x.kind, strategy, res["original_codon"], var]
                        df.append(row)
    df = pd.DataFrame(df)
    # NOTE(review): "origial_code" (sic) — the typo is load-bearing;
    # editseq.py reads this exact column name.
    df.columns = ["gene", "cds_region", "aa_index", "aa_region", "region_with_intron", "cross_cds_border", "strategy",
                  "origial_code", "mutation_code"]
    # recompute strategy as the actual number of mismatched bases
    strategy = []
    for _, row in df.iterrows():
        match = np.sum([x == y for x, y in zip(row["origial_code"], row["mutation_code"])])
        strategy.append(f"{3-match}N")
    df["strategy"] = strategy
    if stop_codon:
        # keep only stop-codon-introducing variants
        df = df[df["mutation_code"].isin(["TAA", "TAG", "TGA"])]
    df.to_csv(output, index = False)
def design_by_snp(snp_info, targets, genes, fasta, fasta_hg38, output):
    """Translate curated SNP tables into concrete mutation rows (CSV).

    Args:
        snp_info: Excel workbook mapping cDNA changes to genomic changes,
            one sheet per gene (sheet names may contain several aliases).
        targets: Excel workbook whose third sheet lists the codes to design.
        genes: directory of per-gene UniProt coordinate TSVs, used to look
            up each gene's chromosome and the cDNA offset anchor.
        fasta: hg19 reference FASTA.
        fasta_hg38: hg38 reference FASTA.
        output: CSV path, written incrementally.
    """
    logger.info("读取染色体")
    chroms = {}
    starts = {}
    for gene in glob(os.path.join(genes, "*.tsv")):
        key = os.path.basename(gene).split(".")[0]
        cds = load_uniprot_region(gene)
        cds = sorted(cds, key=lambda x: [x.chrom, x.start, x.end])
        # chromosome and first CDS base per gene (anchor for c. positions)
        chroms[key] = cds[0].chrom
        starts[key] = cds[0].start
    logger.info(f"读取snp的信息{snp_info}")
    all_sheets = pd.read_excel(snp_info, sheet_name=None)
    # map cDNA change -> genomic change, registered under every alias token
    # found in the sheet name
    res = {}
    for sheet_name, df in all_sheets.items():
        temp = {}
        for _, row in df.iterrows():
            # NOTE: these column headers really do carry trailing spaces
            cdna = row["DNA change (cDNA) "]
            hg38 = row["DNA change (genomic) (hg19)     "]
            temp[cdna] = hg38
        for sheet in re.split(r"[\(\s\)]", sheet_name):
            res[sheet] = temp
    print(res.keys())
    logger.info(f"读取目标:{targets}")
    df = pd.read_excel(targets, sheet_name=2)
    with open(output, "w+") as w:
        w.write(",".join(["gene", "cdna code", "genomic code", "mutation_region", "version", "original_codon", "mutation_code"]) + "\n")
        for column in df.columns:
            if "Unnamed" in column:
                continue
            for code in df[column]:
                if not isinstance(code, str) and math.isnan(code):
                    continue
                # prefer the curated genomic (hg19) change; fall back to the
                # cDNA code anchored at the gene's first CDS base.
                # NOTE(review): "FAND2" looks like a workbook typo of "FANCD2".
                gene_key = "FANCD2" if column == "FAND2" else column
                genomic_code = res.get(column, {}).get(code)
                if genomic_code:
                    sites, rule = decode_snp(genomic_code)
                elif str(code).startswith("c."):
                    sites, rule = decode_snp(code, ref_start=starts[gene_key])
                else:
                    continue
                region = Region(chroms[gene_key], start=sites[0], end=sites[-1])
                hg38 = False
                if genomic_code:
                    region = extract_orf_sequence(fasta, [region])[0]
                elif str(code).startswith("c."):
                    hg38 = True
                    region = extract_orf_sequence(fasta_hg38, [region])[0]
                # the rule grammar is exactly what decode_mutation parses —
                # reuse it instead of duplicating the branch ladder inline
                original, replacement = decode_mutation(rule, region.sequence)
                if not genomic_code:
                    genomic_code = ""
                # expand every N into all base combinations
                for o, r in itertools.product(generate_sequences_with_combinations(original), generate_sequences_with_combinations(replacement)):
                    w.write(",".join([column, code.strip(), str(genomic_code).strip(), str(region), "hg38" if hg38 else "hg19", o, r]) + "\n")
def extract_fastq_seq(fastq: str, chrom, start, end):
    """Fetch reference bases chrom[start:end] (0-based, half-open) from an
    indexed FASTA file.

    Despite the parameter name, `fastq` is a FASTA path.
    """
    import pysam
    with pysam.FastaFile(fastq) as handle:
        return handle.fetch(str(chrom), start, end)
def decode_mutation(rule: str, sequence):
    """Translate an HGVS-style mutation rule into (original, replacement).

    `sequence` is the reference sequence at the mutated site. Supported
    rules: 'X>Y' substitution, 'dup', 'del', bare 'ins', 'delinsXXX' and
    'insXXX'. Any other rule yields ("", "").
    """
    if ">" in rule:
        _, replacement = rule.split(">")
        return sequence, replacement
    if rule == "dup":
        return sequence, sequence * 2
    if rule == "del":
        return sequence, ""
    if rule == "ins":
        return "", sequence
    if "delins" in rule:
        return sequence, rule.replace("delins", "")
    if "ins" in rule:
        return sequence, rule.replace("ins", "")
    return "", ""
def design_by_hmgd(data, fasta, outfile):
    """Build editseq entries (100 bp flanks around each mutation) from an
    HGMD-style CSV with columns gene, hgvs, chromosome, startCoord, endCoord.

    Rows that cannot be processed are logged and skipped — the original
    silently swallowed every exception, hiding data problems.
    """
    import re
    res = pd.read_csv(data)
    data = []
    for idx, row in res.iterrows():
        key = row["gene"] + "_" + str(idx)
        try:
            seq = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1, row["endCoord"])
            seq, replace = decode_mutation(row["hgvs"], seq)
            if not seq:
                continue
            # strip positional digits/underscores left over from the HGVS suffix
            replace = re.sub(r"[\d_]", "", replace)
            if "del" in replace:
                replace = ""
            print(key, seq, replace)
            before = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1 - 100, row["startCoord"])
            after = extract_fastq_seq(fasta, int(row["chromosome"]), row["endCoord"], row["endCoord"] + 100)
            seq = f"{before}({seq}/{replace}){after}"
            data.append({"sequence_name": key, "editseq": seq})
        except Exception as exc:
            # best-effort per-row processing, but record why a row was dropped
            logger.warning(f"skip {key}: {exc}")
            continue
    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
if __name__ == "__main__":
    from fire import Fire  # NOTE(review): imported but no Fire(...) call is active below
    # Historical one-off invocations, kept commented for reference:
    # get_aa_coords(
    #     "../metainfo/Cancer and blood disorder panels_v2.xlsx",
    #     "../gene_coords/batch2"
    # )
    # get_aa_coords(
    #     "../metainfo/DDR gene library in 2021 Cell.xlsx",
    #     "../gene_coords/positive"
    # )
    # # Fire({"aa": design_by_aa})
    # design_by_aa(
    #     "../gene_coords/batch2",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_batch2.csv.gz"
    # )
    # design_by_aa(
    #     "../gene_coords/positive",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_positive.csv.gz",
    #     stop_codon = True
    # )
    # run_analysis(
    #     "../gene_aa_target_batch2.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/batch2"
    # )
    # run_analysis(
    #     "../gene_aa_target_positive.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/positive"
    # )
    # Generate SNP design rows; snp_info is the curated SNP table,
    # targets lists the genes to process.
    # design_by_snp(
    #     snp_info="../metainfo/副本FA家族基因-20250829-DJJ_XD.xlsx",
    #     targets="../metainfo/实验计划.xlsx",
    #     output="gene_snp_target.csv",
    #     fasta="../ref/gencode/GRCh37.p13.genome.fa.gz",
    #     fasta_hg38="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     genes="../gene_coords"
    # )
    # design_by_hmgd(
    #     "../metainfo/allmut.csv",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outfile="../../prediction/input/pos_v2.csv.gz"
    # )
    # url = "https://www.ebi.ac.uk/proteins/api/coordinates?accession=P21359-1"
    # download_uniprot_region("Test", "P21359")

16
design/pyproject.toml Normal file
View File

@@ -0,0 +1,16 @@
[project]
name = "pgrna"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"biopython>=1.85",
"fire>=0.7.1",
"loguru>=0.7.3",
"openpyxl>=3.1.5",
"pandas>=2.3.3",
"pyfaidx>=0.9.0.3",
"pyliftover>=0.4.1",
"rich>=14.2.0",
]

233
design/src/editseq.py Normal file
View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
created by lanzl
modified by zym
"""
import os
import pandas as pd
import re
import sys
from pyfaidx import Fasta, FetchError
HG19_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/gencode/GRCh37.p13.genome.fa.gz"
HG38_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
def parse_region(region_str: str) -> tuple:
    """Parse a 'chrom:start-end' genomic region, forcing a 'chr' prefix.

    Returns:
        (chrom, start, end) with integer coordinates.

    Raises:
        ValueError: if `region_str` does not match the expected format
        (the original crashed with AttributeError on the None match).
    """
    match = re.match(r"(\w+):(\d+)-(\d+)", region_str)
    if match is None:
        raise ValueError(f"invalid genomic region: {region_str!r}")
    chrom, start, end = match.groups()
    if not chrom.lower().startswith("chr"):
        chrom = "chr" + chrom
    return chrom, int(start), int(end)
def extract_orf_sequence(genome: Fasta, chrom: str, start: int, end: int) -> str:
    """
    Fetch genome[chrom][start:end] (1-based, inclusive) as an upper-case string.

    If `chrom` is missing from the index and carries a 'chr' prefix, retry
    without the prefix; otherwise the original error is re-raised.
    """
    try:
        return str(genome.get_seq(chrom, start, end)).upper()
    except (KeyError, FetchError):
        stripped = chrom[3:] if chrom.lower().startswith("chr") else chrom
        if stripped == chrom:
            raise
        try:
            return str(genome.get_seq(stripped, start, end)).upper()
        except (KeyError, FetchError) as retry_err:
            raise FetchError(
                f"Requested rname '{chrom}' (also tried '{stripped}') does not exist in FASTA index."
            ) from retry_err
def generate_editseq(
    original: str,
    replacement: str,
    region_str: str,
    genome: Fasta,
    flank_size: int = 100,
) -> str:
    """Build an EditSeq string: flank + mutation markup + flank.

    Markup: (ORIG/REPL) for replacement/delins, (-ORIG) for deletion,
    (+REPL) for insertion.
    """
    chrom, mut_start, mut_end = parse_region(region_str)
    # flanks are 1-based inclusive, immediately adjacent to the mutated span
    upstream_flank = extract_orf_sequence(genome, chrom, mut_start - flank_size, mut_start - 1)
    downstream_flank = extract_orf_sequence(genome, chrom, mut_end + 1, mut_end + flank_size)
    original = str(original).strip()
    replacement = str(replacement).strip()
    if original and replacement:
        mut_part = f"({original}/{replacement})"   # substitution / delins
    elif original:
        mut_part = f"(-{original})"                # deletion
    elif replacement:
        mut_part = f"(+{replacement})"             # insertion
    else:
        mut_part = "(Invalid mutation logic)"
    return f"{upstream_flank}{mut_part}{downstream_flank}"
# --- 氨基酸突变处理 ---
def process_aa_mutations(df_aa: pd.DataFrame, genome_hg38: Fasta) -> pd.DataFrame:
    """Build EditSeq rows for amino-acid (AA) saturation mutagenesis input.

    Returns a DataFrame with sequence_name, editseq, strategy and
    mutation_type columns.
    """
    results = []
    # to_dict('records') is faster than iterrows() for row-wise access
    for rec in df_aa.to_dict("records"):
        orig = str(rec["origial_code"]).strip() if pd.notna(rec["origial_code"]) else ""
        repl = str(rec["mutation_code"]).strip() if pd.notna(rec["mutation_code"]) else ""
        # classify the mutation by which sides are non-empty
        if orig and repl:
            mut_type = "REPL"
        elif orig:
            mut_type = "DEL"
        elif repl:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"
        results.append(
            {
                "sequence_name": f"{rec['gene']}_AA{rec['aa_index']}_{rec['origial_code']}_{rec['mutation_code']}",
                "editseq": generate_editseq(
                    original=rec["origial_code"],
                    replacement=rec["mutation_code"],
                    region_str=rec["aa_region"],
                    genome=genome_hg38,
                ),
                "strategy": rec["strategy"],
                "mutation_type": mut_type,
            }
        )
    return pd.DataFrame(results)
# --- SNP/cDNA 突变处理 ---
def process_snp_mutations(
    df_snp: pd.DataFrame, genome_hg19: Fasta, genome_hg38: Fasta
) -> pd.DataFrame:
    """Build EditSeq rows for SNP/cDNA mutations.

    Selects the hg19 or hg38 genome per row (via the `version` column) and
    returns a DataFrame with sequence_name, editseq and mutation_type.
    """
    results = []
    # to_dict('records') is faster than iterrows() for row-wise access
    for rec in df_snp.to_dict("records"):
        orig = str(rec["original_codon"]).strip() if pd.notna(rec["original_codon"]) else ""
        repl = str(rec["mutation_code"]).strip() if pd.notna(rec["mutation_code"]) else ""
        # pick the reference matching this row's coordinate version
        genome_to_use = genome_hg38 if str(rec["version"]).lower() == "hg38" else genome_hg19
        # classify the mutation by which sides are non-empty
        if orig and repl:
            mut_type = "REPL"
        elif orig:
            mut_type = "DEL"
        elif repl:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"
        # e.g. GENE_REPL_c123A>G, with '.' removed and '_' -> 'p'
        cdna_code_clean = str(rec["cdna code"]).replace(".", "").replace("_", "p")
        results.append(
            {
                "sequence_name": f"{rec['gene']}_{mut_type}_{cdna_code_clean}",
                "editseq": generate_editseq(
                    original=orig,
                    replacement=repl,
                    region_str=str(rec["mutation_region"]),
                    genome=genome_to_use,
                ),
                "mutation_type": mut_type,
            }
        )
    return pd.DataFrame(results)
def run_analysis(infile, reference, outdir):
    """Read an AA mutation design table and write one EditSeq CSV per
    mutation strategy (3N/2N/1N) into `outdir`.

    Args:
        infile: CSV produced by design_by_aa (must have a `strategy` column).
        reference: reference genome FASTA path.
        outdir: output directory, created if missing; files are named
            aa_<STRATEGY>_editseq_output.csv.
    """
    # AA_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_aa_target.csv"
    # SNP_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_snp_target.csv"
    # genome_hg19 = Fasta(HG19_FASTA_PATH)
    # genome_hg38 = Fasta(HG38_FASTA_PATH)
    genome = Fasta(reference)
    os.makedirs(outdir, exist_ok=True)
    aa_df_input = pd.read_csv(infile)
    # snp_df_input = pd.read_csv(SNP_INPUT_FILE)
    # --- Stage 1: SNP/cDNA mutations (currently disabled) ---
    # snp_df = process_snp_mutations(snp_df_input, genome_hg19, genome_hg38)
    # snp_output_file = "snp_editseq_output.csv"
    # snp_df.to_csv(snp_output_file, index=False)
    # --- Stage 2: AA mutations, grouped by strategy ---
    aa_df_input["strategy"] = aa_df_input["strategy"].str.upper()
    strategies = aa_df_input["strategy"].unique()
    for strategy in strategies:
        # skip rows with no strategy value
        if pd.isna(strategy):
            continue
        aa_subset_df = aa_df_input[aa_df_input["strategy"] == strategy].copy()
        if aa_subset_df.empty:
            continue
        aa_df_processed = process_aa_mutations(aa_subset_df, genome)
        aa_output_file = f"aa_{strategy}_editseq_output.csv"
        aa_df_processed.to_csv(os.path.join(outdir, aa_output_file), index=False)
if __name__ == "__main__":
    # The original called run_analysis() with no arguments, which always
    # raised TypeError (infile/reference/outdir are required).
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} <infile> <reference> <outdir>")
    run_analysis(sys.argv[1], sys.argv[2], sys.argv[3])

53
design/src/liftover.py Normal file
View File

@@ -0,0 +1,53 @@
import pandas as pd
from pyliftover import LiftOver

# lo = LiftOver('/home/zym/projects/pgRNA/liftover/hg19ToHg38.over.chain.gz')
# hg19 -> hg38 converter. NOTE: constructing this at module import time
# fetches the chain file over the network on first use.
# (The duplicate `from pyliftover import LiftOver` line was removed.)
lo = LiftOver("hg19", "hg38")
def convert_interval(chrom, start, end):
    """
    Lift the interval (start, end) on `chrom` from hg19 to hg38.

    Returns (new_chrom, new_start, new_end), or None when either endpoint
    cannot be mapped.

    NOTE(review): pyliftover's convert_coordinate expects 0-based positions;
    the original comments assumed 1-based input — confirm which convention
    the callers actually use (BED input would need start+1 handling).
    """
    result_start = lo.convert_coordinate(chrom, start)
    result_end = lo.convert_coordinate(chrom, end)
    if not result_start or not result_end:
        return None  # unmappable endpoint
    # take the highest-confidence (chain score) mapping for each endpoint
    best_start = max(result_start, key=lambda x: x[3])
    best_end = max(result_end, key=lambda x: x[3])
    new_chrom = best_start[0]
    new_start = best_start[1]
    new_end = best_end[1]
    # endpoints can swap order after lifting (e.g. strand flip); normalize
    if new_start >= new_end:
        new_start, new_end = new_end, new_start
    return new_chrom, new_start, new_end
def get_seq(path, coord):
    """Fetch the sequence for `coord` = (chrom, start, end) from an indexed
    FASTA file (pysam region syntax: 1-based, inclusive).

    The original left the FastaFile handle open; the context manager
    guarantees it is closed.
    """
    import pysam
    # pysam reads genome.fa.fai automatically
    with pysam.FastaFile(path) as fasta:
        return fasta.fetch(region=f"{coord[0]}:{coord[1]}-{coord[2]}")
if __name__ == "__main__":
pass

216
design/src/mutation.py Normal file
View File

@@ -0,0 +1,216 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import itertools
from Bio.Seq import Seq
from loguru import logger
# 遗传密码表DNA -> 氨基酸
codon_table = {
"TTT": "F",
"TTC": "F",
"TTA": "L",
"TTG": "L",
"TCT": "S",
"TCC": "S",
"TCA": "S",
"TCG": "S",
"TAT": "Y",
"TAC": "Y",
"TAA": "*",
"TAG": "*",
"TGT": "C",
"TGC": "C",
"TGA": "*",
"TGG": "W",
"CTT": "L",
"CTC": "L",
"CTA": "L",
"CTG": "L",
"CCT": "P",
"CCC": "P",
"CCA": "P",
"CCG": "P",
"CAT": "H",
"CAC": "H",
"CAA": "Q",
"CAG": "Q",
"CGT": "R",
"CGC": "R",
"CGA": "R",
"CGG": "R",
"ATT": "I",
"ATC": "I",
"ATA": "I",
"ATG": "M",
"ACT": "T",
"ACC": "T",
"ACA": "T",
"ACG": "T",
"AAT": "N",
"AAC": "N",
"AAA": "K",
"AAG": "K",
"AGT": "S",
"AGC": "S",
"AGA": "R",
"AGG": "R",
"GTT": "V",
"GTC": "V",
"GTA": "V",
"GTG": "V",
"GCT": "A",
"GCC": "A",
"GCA": "A",
"GCG": "A",
"GAT": "D",
"GAC": "D",
"GAA": "E",
"GAG": "E",
"GGT": "G",
"GGC": "G",
"GGA": "G",
"GGG": "G",
}
# Reverse lookup: amino acid -> list of codons encoding it
aa_to_codons = {}
for codon, aa in codon_table.items():
    aa_to_codons.setdefault(aa, []).append(codon)
# Nucleotide bases
bases = ["A", "T", "G", "C"]
def generate_nnn():
    """Return all 64 codons (every NNN combination) in A/T/G/C product order.

    Uses itertools.product over the base alphabet directly instead of a
    module-level mutable list, making the function self-contained.
    """
    return ["".join(triplet) for triplet in itertools.product("ATGC", repeat=3)]
def generate_2n_variants(original_codon, fixed_position=None):
    """
    Generate double-N variants: two codon positions range over A/T/G/C
    while one keeps its original base.

    Args:
        original_codon: the 3-letter reference codon.
        fixed_position: 0, 1 or 2 — which position stays unchanged; if None,
            all three patterns (XNN, NXN, NNX) are combined.

    Returns:
        Sorted list of unique codon strings.

    The manual slot-filling loop of the original is replaced with an
    itertools.product-based construction; output is identical.
    """
    fixed_choices = [fixed_position] if fixed_position is not None else [0, 1, 2]
    variants = set()
    for pos in fixed_choices:
        for b1, b2 in itertools.product("ATGC", repeat=2):
            free = iter((b1, b2))
            variants.add(
                "".join(original_codon[i] if i == pos else next(free) for i in range(3))
            )
    return sorted(variants)
def generate_1n_variants(original_codon, fixed_positions=None):
    """
    Generate single-N variants: one codon position ranges over A/T/G/C
    while the other two keep their original bases.

    Args:
        original_codon: the 3-letter reference codon.
        fixed_positions: e.g. [0, 1] keeps positions 0 and 1 fixed; if not
            given, all three patterns ([0,1], [0,2], [1,2]) are combined.

    Returns:
        Sorted list of unique codon strings.

    The original contained a dead assignment (`var_pos = 3 - sum(fix)`)
    that was immediately overwritten by a search loop; it has been removed.
    """
    position_pairs = [fixed_positions] if fixed_positions else [[0, 1], [0, 2], [1, 2]]
    variants = set()
    for fixed in position_pairs:
        # the single position not listed in `fixed` is the variable one
        var_pos = next(i for i in range(3) if i not in fixed)
        for base in "ATGC":
            codon = list(original_codon)
            codon[var_pos] = base
            variants.add("".join(codon))
    return sorted(variants)
def translate(codon):
    """Translate a DNA codon to its one-letter amino acid ('X' if unknown)."""
    try:
        return codon_table[codon]
    except KeyError:
        return "X"
def design_mutations_for_orf(dna_seq, strategy="3N"):
    """
    Saturation-mutagenesis design over a whole ORF, codon by codon.

    Args:
        dna_seq: ORF DNA sequence; length must be a multiple of 3.
        strategy: '3N' (all 64 codons), '2N' (two positions vary) or
            '1N' (one position varies).

    Returns:
        A list of dicts, one per codon, containing the original codon/AA,
        the variant codons, and summary counts (including how many variants
        introduce a stop codon).

    Raises:
        ValueError: if the length is not a multiple of 3 or the strategy
        is unknown.
    """
    if len(dna_seq) % 3 != 0:
        raise ValueError(f"ORF 长度必须是 3 的倍数!{dna_seq}")
    num_codons = len(dna_seq) // 3
    results = []
    for i in range(num_codons):
        start = i * 3
        end = start + 3
        orig_codon = dna_seq[start:end]
        orig_aa = translate(orig_codon)
        logger.debug(
            f"\n--- 位点 {i + 1} (氨基酸 {i + 1}): {orig_aa} ({orig_codon}) ---"
        )
        variants = []
        if strategy == "3N":
            variants = generate_nnn()
            logger.debug(f"策略: 3N (NNN) → 共 {len(variants)} 种组合")
        elif strategy == "2N":
            variants = generate_2n_variants(orig_codon)
            logger.debug(f"策略: 2N (任意两个随机) → 共 {len(variants)} 种组合")
        elif strategy == "1N":
            variants = generate_1n_variants(orig_codon)
            logger.debug(f"策略: 1N (任意一个随机) → 共 {len(variants)} 种组合")
        else:
            raise ValueError("strategy 必须是 '3N', '2N', 或 '1N'")
        # defensive filter; the generators above always emit 3-mers
        valid_variants = [v for v in variants if len(v) == 3]
        # summarize what the variants translate to
        mutant_aa_count = {}
        stop_count = 0
        for v in valid_variants:
            aa = translate(v)
            if aa == "*":
                stop_count += 1
            mutant_aa_count[aa] = mutant_aa_count.get(aa, 0) + 1
        logger.debug(f"→ 共产生 {len(valid_variants)} 个有效突变")
        logger.debug(f"→ 可产生 {len(mutant_aa_count)} 种不同氨基酸(含终止)")
        logger.debug(f"→ 引入终止密码子: {stop_count}")
        logger.debug(f"→ 氨基酸分布: {mutant_aa_count}")
        results.append(
            {
                "position": i + 1,
                "original_codon": orig_codon,
                "original_aa": orig_aa,
                "variants": valid_variants,
                "variant_count": len(valid_variants),
                "mutant_aa_count": mutant_aa_count,
                "stop_count": stop_count,
            }
        )
    return results
if __name__ == "__main__":
pass

236
design/src/reader.py Normal file
View File

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从gtf中读取CDS并读取对应的sequence
"""
import gzip
import pandas as pd
from Bio.Seq import Seq
from loguru import logger
from pyfaidx import Fasta
class Region(object):
    """A genomic interval (chrom, start, end, strand) used for merging CDS
    pieces and carrying an extracted sequence.

    kind_ records which codon border the region touches ("start"/"end");
    addition links an arbitrary related object (e.g. the parent CDS region).
    """

    def __init__(self, chrom, start, end, strand="+", kind=None):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.strand = strand
        self.sequence = None
        self.kind_ = kind
        self.addition = None

    @classmethod
    def create(cls, region):
        """Build a Region from a 'chrom:start-end' string (strand '+')."""
        chrom, sites = region.split(":")
        sites = [int(x) for x in sites.split("-")]
        return cls(chrom, sites[0], sites[-1], "+")

    def set_seq(self, sequence: str):
        """Store `sequence` upper-cased; reverse-complement first on '-' strand.

        Bug fix: the original reverse-complemented the raw (possibly
        lower-case, soft-masked) input, so '-' strand sequences skipped the
        upper() normalization. Operate on the upper-cased copy instead.
        """
        self.sequence = sequence.upper()
        if self.strand == "-":
            self.sequence = str(Seq(self.sequence).reverse_complement())

    def __and__(self, other):
        """True if the two regions overlap on the same chromosome."""
        if self.chrom != other.chrom:
            return False
        return self.start < other.end and self.end > other.start

    def __add__(self, other):
        """Merge an overlapping region into self (in place) and return self."""
        if not self & other:
            raise ValueError("没有重合位点")
        self.start = min(self.start, other.start)
        self.end = max(self.end, other.end)
        return self

    def __str__(self) -> str:
        return f"{self.chrom}:{self.start}-{self.end}"

    def __hash__(self):
        return hash(str(self))

    def __len__(self):
        return self.end - self.start

    @property
    def kind(self):
        # regions shorter than a codon (3 bp) are flagged as border-crossing
        if len(self) >= 3:
            return self.kind_
        if not self.kind_:
            return ""
        else:
            return f"cross_{self.kind_}"
def read_gtf(gtf_path):
    """
    Read a GTF file and return a DataFrame of its CDS features, with the
    `attribute` column parsed into one column per attribute key.
    """
    logger.info("正在读取 GTF 文件...")
    columns = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attribute",
    ]
    df = pd.read_csv(
        gtf_path, sep="\t", comment="#", header=None, names=columns, low_memory=False
    )
    # keep only CDS rows
    cds_df = df[df["feature"] == "CDS"].copy()
    # expand the attribute string into columns via a list of dicts
    try:
        attributes_df = pd.json_normalize(cds_df["attribute"].apply(parse_attributes))
    except Exception as e:
        logger.error(f"解析 attribute 字段失败: {e}")
        raise
    # align parsed attribute columns with the re-indexed CDS rows
    result_df = pd.concat([cds_df.reset_index(drop=True), attributes_df], axis=1)
    logger.info(f"成功读取并解析 GTF 文件,共 {len(result_df)} 个 CDS 特征。")
    return result_df
def parse_attributes(attr_str):
    """Parse a GTF attribute field into a {key: value} dict (quotes stripped).

    Entries without a space separator are ignored.
    """
    parsed = {}
    for chunk in attr_str.split(";"):
        chunk = chunk.strip()
        if chunk and " " in chunk:
            key, _, value = chunk.partition(" ")
            parsed[key] = value.strip('"')
    return parsed
def get_cds_for_gene(cds_df, gene_name):
    """
    Collect the CDS entries whose transcript_id equals `gene_name` and merge
    overlapping ones into Region objects.

    NOTE(review): despite the function name, this matches *transcript* IDs
    (the gene_name comparison is commented out below), and it does not pick
    the longest transcript as the old docstring claimed — confirm intent.
    """
    logger.info(f"正在查找基因 '{gene_name}' 的 CDS...")
    # parse the attribute string once per row
    cds_df["attributes_parsed"] = cds_df["attribute"].apply(parse_attributes)
    gene_cds_list = []
    for idx, row in cds_df.iterrows():
        attrs = row["attributes_parsed"]
        if attrs.get("transcript_id") == gene_name:
            # if attrs.get('gene_name') == gene_name or attrs.get('gene_id').startswith(gene_name):
            gene_cds_list.append(row)
    if not gene_cds_list:
        raise ValueError(f"未在 GTF 中找到基因 '{gene_name}'")
    df = pd.DataFrame(gene_cds_list)
    df = df[
        ["seqname", "feature", "start", "end", "strand", "transcript_id"]
    ].drop_duplicates()
    res = []
    last = None
    for _, row in df.iterrows():
        # NOTE(review): the fifth positional argument lands in Region's
        # `kind` parameter, so transcript_id is stored as kind_ — confirm.
        temp = Region(
            str(row["seqname"]),
            row["start"],
            row["end"],
            str(row["strand"]),
            row["transcript_id"],
        )
        # merge consecutive overlapping regions (assumes sorted input)
        if last is None:
            last = temp
        elif temp & last:
            last = last + temp
        else:
            res.append(last)
            last = temp
    # flush the last open region
    if last not in res:
        res.append(last)
    return res
def load_uniprot_region(path):
    """Read a UniProt coordinate TSV (as written by download_uniprot_region)
    and merge consecutive overlapping genomic regions.

    Lines starting with '#' are skipped. Returns a list of merged Region
    objects; an empty or comment-only file yields [] (the original appended
    a spurious None in that case).
    """
    res = []
    last = None
    with open(path) as r:
        for line in r:
            if line.startswith("#"):
                continue
            temp = Region.create(line.split()[0])
            if last is None:
                last = temp
            elif temp & last:
                last = last + temp
            else:
                res.append(last)
                last = temp
    # flush the last open region; guard against empty input
    if last is not None and last not in res:
        res.append(last)
    return res
def extract_orf_sequence(genome_fasta, cds_rows, half_open=False):
    """
    Fill each Region in `cds_rows` with its reference sequence (via
    Region.set_seq) and return the list.

    Coordinates are treated as 1-based; with half_open=True the end base is
    excluded. Sub-codon cross-border regions (<3 bp) are widened to a full
    codon before extraction, mirroring adjust_cross_border_region.

    NOTE(review): the chromosome is taken from the first row and assumed to
    hold for all rows — confirm callers never mix chromosomes. A new Fasta
    handle is opened on every call and never closed explicitly.
    """
    if not cds_rows:
        raise ValueError("not cds")
    seqname = cds_rows[0].chrom
    strand = cds_rows[0].strand
    logger.debug(f"从参考基因组提取序列 (chr{seqname})...")
    genome = Fasta(genome_fasta)
    try:
        chrom_seq = genome[seqname]  # e.g. "chr1" vs "1", depends on FASTA naming
    except KeyError:
        # retry with the alternative chromosome naming convention
        if "chr" in seqname:
            seqname = seqname.replace("chr", "")
        else:
            seqname = "chr" + seqname
        chrom_seq = genome[seqname]  # e.g. "chr1" vs "1", depends on FASTA naming
    for row in cds_rows:
        start = int(row.start) - 1  # GTF is 1-based, pyfaidx slicing is 0-based
        end = int(row.end) - (1 if half_open else 0)
        if len(row) < 3 and "cross" in row.kind:
            # widen sub-codon border pieces to a full 3 bp codon
            if row.kind == "cross_start":
                start = end - 3
            else:
                end = start + 3
        row.set_seq(chrom_seq[start:end].seq)
    return cds_rows
if __name__ == "__main__":
pass

139
design/src/safe_target.py Normal file
View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
生成safe targeting 序列
"""
import pandas as pd
import random
import pysam
from tqdm import tqdm
seed = 42
random.seed(42)
__AAs__ = {
"丙氨酸": ["GCU", "GCC", "GCA", "GCG"],
"精氨酸": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
"天冬酰胺": ["AAU", "AAC"],
"天冬氨酸": ["GAU", "GAC"],
"半胱氨酸": ["UGU", "UGC"],
"谷氨酰胺": ["CAA", "CAG"],
"谷氨酸": ["GAA", "GAG"],
"甘氨酸": ["GGU", "GGC", "GGA", "GGG"],
"组氨酸": ["CAU", "CAC"],
"异亮氨酸": ["AUU", "AUC", "AUA"],
"亮氨酸": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"],
"赖氨酸": ["AAA", "AAG"],
"甲硫氨酸": ["AUG"],
"苯丙氨酸": ["UUU", "UUC"],
"脯氨酸": ["CCU", "CCC", "CCA", "CCG"],
"丝氨酸": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
"苏氨酸": ["ACU", "ACC", "ACA", "ACG"],
"色氨酸": ["UGG"],
"酪氨酸": ["UAU", "UAC"],
"缬氨酸": ["GUU", "GUC", "GUA", "GUG"],
"终止密码子": ["UAA", "UAG", "UGA"],
}
def codons():
    """Yield (amino-acid name, DNA codon list) pairs, converting the RNA
    codons (U) stored in __AAs__ to DNA (T)."""
    for name, rna_codons in __AAs__.items():
        yield name, [codon.replace("U", "T") for codon in rna_codons]
class Region:
    """A contiguous genomic interval from the safe-region table; choose()
    slices it into 3 bp codon windows after shift() finds a codon frame."""

    def __init__(self, chrom: str, start: int, end: int):
        self.chrom = chrom
        self.start = start
        self.end = end
        # frame offset discovered by shift(); name-mangled to _Region__shift__
        self.__shift__ = 0

    def __str__(self):
        return f"{self.chrom}:{self.start}-{self.end}"

    def shift(self, fasta: str):
        """Scan forward until a 3 bp window matches a known DNA codon and
        remember that offset as the reading frame.

        NOTE(review): codons() yields upper-case DNA codons, but the
        sequence fetched from the FASTA may be lower-case (soft-masked
        reference); the `in` test is case-sensitive — confirm the reference
        case before relying on this.
        """
        for i in range(0, self.end - self.start):
            # stop as soon as a frame has been found
            if self.__shift__ != 0:
                break
            seq = extract_fastq_seq(fasta, Region(self.chrom, self.start+i, self.start+i+3))
            for _, values in codons():
                if seq in values:
                    self.__shift__ = i
                    break

    def choose(self, number: int = 3):
        """Return up to `number` randomly sampled, non-overlapping 3 bp codon
        windows inside the region, starting at the shifted frame offset."""
        length_of_codon = 3
        regions = []
        for i in range(self.start + self.__shift__, self.end, length_of_codon):
            if i + length_of_codon > self.end:
                break
            regions.append([i, i + length_of_codon])
        # np.choice(my_list, size=3, replace=False)
        if number > len(regions):
            return [Region(self.chrom, x[0], x[1]) for x in regions]
        return [Region(self.chrom, x[0], x[1]) for x in random.sample(regions, number)]
def extract_fastq_seq(fastq: str, region: Region, seq_len: int = 100):
    """Fetch the reference bases covering `region` (0-based, half-open).

    NOTE(review): `seq_len` is accepted but never used — kept for interface
    compatibility; confirm whether callers expect it to cap the length.
    """
    with pysam.FastaFile(fastq) as handle:
        return handle.fetch(region.chrom, region.start, region.end)
def mutation(seq: str):
    """Pick a random codon encoding a *different* amino acid than `seq`.

    `seq` is a DNA codon (T-based) while __AAs__ stores RNA codons (U-based),
    so the lookup converts to RNA first — in the original, any codon
    containing T could never match and silently returned None (main() then
    dropped the row). Returns a DNA codon, or None for unknown input.
    """
    random.seed(seed)  # re-seed per call for reproducible choices
    rna = seq.upper().replace("T", "U")
    for key, value in __AAs__.items():
        if rna in value:
            random_keys = random.sample([x for x in __AAs__.keys() if x != key], 1)[0]
            return random.sample(__AAs__[random_keys], 1)[0].replace("U", "T")
def main(infile, outfile, reference = "../ref/UCSC/hg19.fa.gz", seq_len: int = 100):
    """Sample safe-harbor regions and emit editseq entries with a random
    different-amino-acid codon swap plus `seq_len` bp flanks on each side.

    Args:
        infile: Excel workbook with a "Human Safe Regions" sheet whose first
            column is "chrom;start;end".
        outfile: output CSV with sequence_name and editseq columns.
        reference: reference FASTA path.
        seq_len: flank length on each side of the codon.
    """
    meta = pd.read_excel(infile, sheet_name="Human Safe Regions", header=None)
    # fixed random_state keeps the 2000 sampled regions reproducible
    meta = meta.sample(n=2000, random_state=seed)
    data = []
    for idx in tqdm(meta.iloc[:, 0], total=meta.shape[0]):
        idx = idx.split(";")
        region = Region(idx[0], int(idx[1]), int(idx[2]))
        # align the window to a codon frame before sampling
        region.shift(reference)
        regions = region.choose(5)
        for reg in regions:
            seq = extract_fastq_seq(reference, reg)
            mut = mutation(seq)
            # mutation() returns None for unrecognized codons; skip those
            if seq is None or mut is None:
                continue
            key = str(reg) + "_" + seq + "_" + mut
            before = extract_fastq_seq(reference, Region(region.chrom, reg.start - seq_len, reg.start))
            after = extract_fastq_seq(reference, Region(region.chrom, reg.end, reg.end + seq_len))
            seq = f"{before}({seq}/{mut}){after}"
            data.append({"sequence_name": key, "editseq": seq})
    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
if __name__ == "__main__":
from fire import Fire
Fire(main)

114
design/src/snp.py Normal file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""用来解析snp错配信息"""
import re
from itertools import product
def generate_sequences_with_combinations(seq):
    """
    Expand every 'N' in a DNA sequence to all A/T/C/G possibilities.

    Each N varies independently, in A, T, C, G order, with the leftmost N
    varying slowest — the same ordering the original block-wise expansion
    produced, via a single itertools.product over per-character choices.
    (The original's manual segment splitting, unreachable no-N fallback and
    redundant re-import of `product` are gone.)

    Args:
        seq (str): DNA sequence, possibly containing N.

    Returns:
        list: every concrete sequence; [seq] if it contains no N.
    """
    if "N" not in seq:
        return [seq]
    # fixed bases contribute a single choice; each N contributes all four
    choices = ("ATCG" if base == "N" else base for base in seq)
    return ["".join(combo) for combo in product(*choices)]
def decode_snp(label, ref_start=0):
    """
    Parse an HGVS-like position label into (sorted sites, edit rule).

    A genomic label ("g.") is expected when ref_start <= 0; a cDNA label
    ("c.") when ref_start > 0, in which case ref_start is added to every
    parsed position. Intronic offsets such as "123+4" (or "123-4") are
    folded by summing both numbers.

    Returns:
        "" when label is None; otherwise a (list_of_int_sites, rule_str)
        tuple, where rule_str is the label with positions stripped
        (e.g. "A>G", "del").
    """
    if label is None:
        return ""
    if ":" in label:
        label = label.split(":")[-1]
    if ref_start <= 0:
        if not label.startswith("g."):
            raise ValueError(f"{label} not genomic label")
    elif not label.startswith("c."):
        raise ValueError(f"{label} not cdna label")
    # Drop the "c."/"g." prefix and any "[n]" repeat markers.
    label = re.sub(r"([cg]\.|\[\d+\])", "", label)
    sites = []
    for token in label.split("_"):
        if not token:
            continue
        # Keep only digits and +/- so "123+4A>G" reduces to "123+4".
        token = re.sub(r"[^\d\+-]", "", token)
        if "+" in token:
            parts = [int(p) for p in token.split("+")]
            position = parts[0] + parts[-1]
        elif "-" in token:
            # NOTE(review): mirrors the original, which also ADDS the
            # offset for "-" labels — confirm that is the intended
            # semantics for upstream intronic positions.
            parts = [int(p) for p in token.split("-")]
            position = parts[0] + parts[-1]
        else:
            position = int(token)
        sites.append(position + ref_start)
    rule = re.sub(r"[\d_\+-]", "", label)
    return sorted(sites), rule.strip()
if __name__ == "__main__":
    # Library module: no command-line behavior. Import decode_snp /
    # generate_sequences_with_combinations from callers instead.
    pass

263
design/src/snv-N-2N-3N.py Normal file
View File

@@ -0,0 +1,263 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import sys
import itertools
import random
import gzip
from typing import List, Dict, Any, Optional
import pandas as pd
from loguru import logger
# Pipeline summary:
# Split the target sequence into 3 bp codons and, for each codon,
# systematically generate three classes of mutations (1N/2N/3N substitutions).
# Attach 100 bp upstream/downstream flanking sequence to every mutation,
# then randomly sample 150 mutations per class.
# Mutation design constants
NUCLEOTIDES = ["A", "T", "C", "G"]  # DNA alphabet used when enumerating substitutions
UPSTREAM_LEN = 100  # bp of upstream flank attached to each edit
DOWNSTREAM_LEN = 100  # bp of downstream flank attached to each edit
TARGET_MUTATIONS = 150  # mutations randomly sampled per strategy (1N/2N/3N)
class Region:
    """A single 3 bp codon window within the full target sequence."""

    def __init__(
        self, chrom: str, start: int, end: int, sequence: str, absolute_index: int
    ):
        # `chrom` carries the gene name supplied by the caller here,
        # not necessarily a chromosome identifier.
        self.chrom = chrom
        # Offset of the codon's first base within the full sequence.
        self.absolute_index = absolute_index
        self.start = start
        self.end = end
        # Upper-case so downstream base comparisons are case-insensitive.
        self.sequence = sequence.upper()
def read_fasta(fasta_path: str) -> Dict[str, str]:
    """
    Parse a (possibly gzip-compressed) FASTA file.

    Returns {record_id: sequence} where record_id is the first whitespace
    token after '>', sequences are upper-cased, and RNA 'U' is converted
    to DNA 'T'. Logs an error and returns {} when the file is missing.
    """
    if not os.path.exists(fasta_path):
        logger.error(f"FASTA file not found: {fasta_path}")
        return {}
    open_fn = gzip.open if fasta_path.endswith(".gz") else open
    records: Dict[str, str] = {}

    def _flush(header: Optional[str], chunks: List[str]) -> None:
        # Store the accumulated record, normalising case and U->T.
        if header and chunks:
            records[header] = "".join(chunks).upper().replace("U", "T")

    header: Optional[str] = None
    chunks: List[str] = []
    with open_fn(fasta_path, "rt") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(">"):
                _flush(header, chunks)
                header = line[1:].split()[0]
                chunks = []
            elif header:
                # Sequence lines before any header are silently ignored,
                # matching the original behavior.
                chunks.append(line)
    _flush(header, chunks)
    return records
def split_sequence_to_codons(full_seq: str, gene_name: str) -> List[Region]:
    """
    Chunk the full sequence into complete 3 bp codon Regions.

    Trailing 1-2 leftover bases (when len(full_seq) is not a multiple of 3)
    are dropped, matching the original len(codon) == 3 filter.
    """
    # NOTE(review): end = start + 2 mirrors the original constructor call;
    # an exclusive end would be start + 3 — confirm intended convention.
    return [
        Region(gene_name, pos, pos + 2, full_seq[pos : pos + 3], pos)
        for pos in range(0, len(full_seq) - 2, 3)
    ]
def analyze_variant(ref: str, alt: str) -> str:
    """
    Render the difference between ref and alt compactly.

    Length mismatch: the whole pair, e.g. ('AT','ATG') -> '(AT/ATG)'.
    One consecutive run of differences: 'CAT'->'CGC' -> 'C(AT/GC)'
    (a single substitution like 'CAT'->'CGT' -> 'C(A/G)T' is the
    run-length-1 case).
    Intermittent differences: one group per position, e.g.
    'ATG'->'CTC' -> '(A/C)T(G/C)'.
    Identical strings are returned unchanged.
    """
    if len(ref) != len(alt):
        return f"({ref}/{alt})"
    mismatch_at = [i for i, (r, a) in enumerate(zip(ref, alt)) if r != a]
    if not mismatch_at:
        return ref
    contiguous = all(b - a == 1 for a, b in zip(mismatch_at, mismatch_at[1:]))
    if contiguous:
        lo, hi = mismatch_at[0], mismatch_at[-1] + 1
        return f"{ref[:lo]}({ref[lo:hi]}/{alt[lo:hi]}){ref[hi:]}"
    pieces = []
    cursor = 0
    for i in mismatch_at:
        pieces.append(ref[cursor:i])
        pieces.append(f"({ref[i]}/{alt[i]})")
        cursor = i + 1
    pieces.append(ref[cursor:])
    return "".join(pieces)
def generate_codon_mutations(original_codon: str, n_mutations: int) -> List[str]:
    """
    Enumerate every codon variant differing from the original at exactly
    n_mutations positions, returned sorted.
    """
    found = set()
    length = len(original_codon)
    for mutated_positions in itertools.combinations(range(length), n_mutations):
        # At each mutated position offer the three alternative bases;
        # everywhere else keep the original base.
        per_position = [
            [b for b in NUCLEOTIDES if b != original_codon[i]]
            if i in mutated_positions
            else [original_codon[i]]
            for i in range(length)
        ]
        for bases in itertools.product(*per_position):
            candidate = "".join(bases)
            # The inequality check is load-bearing for n_mutations == 0,
            # where the only candidate equals the original codon.
            if candidate != original_codon:
                found.add(candidate)
    return sorted(found)
def generate_editseq_and_metadata(
    full_seq: str, regions: List[Region], gene_name: str
) -> pd.DataFrame:
    """
    Build the full mutation table (3N, 2N, 1N) for every codon region.

    Each row carries a sequence_name
    (GENE_SUB_STRATEGY_AAINDEX_ORIGINAL>MUTATION), an editseq
    (upstream flank + compact variant notation + downstream flank),
    the strategy label, and mutation_type 'REPL'.
    """
    rows: List[Dict[str, str]] = []
    seq_len = len(full_seq)
    for codon_idx, region in enumerate(regions):
        start = region.absolute_index
        ref_codon = region.sequence
        # Flanks are clipped at the sequence boundaries.
        upstream = full_seq[max(0, start - UPSTREAM_LEN) : start]
        downstream = full_seq[start + 3 : min(seq_len, start + 3 + DOWNSTREAM_LEN)]
        for strategy, n_mut in (("3N", 3), ("2N", 2), ("1N", 1)):
            for alt_codon in generate_codon_mutations(ref_codon, n_mut):
                # Compact (ref/alt) rendering of the substitution.
                compact = analyze_variant(ref_codon, alt_codon)
                rows.append(
                    {
                        "sequence_name": f"{gene_name}_SUB_{strategy}_AA{codon_idx + 1}_{ref_codon}>{alt_codon}",
                        "editseq": f"{upstream}{compact}{downstream}",
                        "strategy": strategy,
                        "mutation_type": "REPL",
                    }
                )
    return pd.DataFrame(rows)
def run_mutation_design(fasta_file: str, gene_name: str, output_base_name: str) -> None:
    """Executes the mutation design pipeline and saves 3 separate files.

    Reads the FASTA, selects the target sequence (by gene-name substring
    match on record IDs, falling back to the longest record), generates all
    1N/2N/3N codon substitutions with flanks, then samples up to
    TARGET_MUTATIONS per strategy and writes one CSV per strategy.
    `output_base_name` must contain the literal "{strategy}" placeholder.
    """
    logger.info(f"Targeting gene: {gene_name}")
    fasta_data = read_fasta(fasta_file)
    full_seq, target_id = "", ""
    # Locate target sequence: first record whose ID contains the gene name
    # (case-insensitive substring match).
    for seq_id, seq in fasta_data.items():
        if gene_name.upper() in seq_id.upper():
            full_seq = seq
            target_id = seq_id
            break
    if not full_seq and fasta_data:
        # Fallback: use longest sequence
        target_id, full_seq = max(fasta_data.items(), key=lambda item: len(item[1]))
        if full_seq:
            logger.warning(
                f"Using longest sequence ID: {target_id} (Length: {len(full_seq)} bp)"
            )
    if not full_seq:
        # Empty FASTA (or unreadable file) — nothing to design against.
        logger.error(f"Failed to extract target sequence.")
        return
    logger.info(f"Target sequence ID: {target_id}, Length: {len(full_seq)} bp")
    # 1. Generate ALL mutations (1N, 2N, 3N)
    cds_regions = split_sequence_to_codons(full_seq, gene_name)
    all_mutations_df = generate_editseq_and_metadata(full_seq, cds_regions, gene_name)
    # 2. Process and save
    strategies = ["1N", "2N", "3N"]
    for strategy in strategies:
        # Filter for the current strategy
        strategy_df = all_mutations_df[all_mutations_df["strategy"] == strategy].copy()
        original_count = len(strategy_df)
        # Determine output file name (e.g., AAVS1_1N_150_mutations.csv)
        output_file_name = output_base_name.replace("{strategy}", strategy)
        if original_count == 0:
            logger.warning(
                f"Strategy {strategy}: No mutations generated. Skipping file creation for {output_file_name}."
            )
            continue
        # Random sampling for the current strategy; fixed random_state keeps
        # the selection reproducible across runs.
        if original_count > TARGET_MUTATIONS:
            final_df = strategy_df.sample(n=TARGET_MUTATIONS, random_state=42)
            logger.success(
                f"Strategy {strategy}: Sampled {TARGET_MUTATIONS} mutations from {original_count} designs."
            )
        else:
            final_df = strategy_df
            logger.warning(
                f"Strategy {strategy}: Generated {original_count} mutations; saving all."
            )
        # Save result, ensuring column order
        final_df[["sequence_name", "editseq", "strategy", "mutation_type"]].to_csv(
            output_file_name, index=False
        )
        logger.success(f"Strategy {strategy}: Design saved to {output_file_name}.")
if __name__ == "__main__":
    # Hard-coded inputs for the AAVS1 safe-harbor design run; output file
    # names are derived per strategy from the "{strategy}" placeholder.
    AAVS1_FASTA_PATH = (
        "/rawdata1/project/peRNA_design/ref/AAVS1/ncbi_dataset/data/rna.fna"
    )
    GENE_NAME = "AAVS1"
    OUTPUT_BASE_NAME = "AAVS1_{strategy}_150_mutations.csv"
    run_mutation_design(
        fasta_file=AAVS1_FASTA_PATH,
        gene_name=GENE_NAME,
        output_base_name=OUTPUT_BASE_NAME,
    )