提交饱和编辑的相关设计,及检验代码
This commit is contained in:
450
design/main.py
Normal file
450
design/main.py
Normal file
@@ -0,0 +1,450 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from glob import glob
|
||||
import requests as rq
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
import numpy as np
|
||||
from src.mutation import design_mutations_for_orf
|
||||
from src.reader import (extract_orf_sequence, get_cds_for_gene,
|
||||
load_uniprot_region, read_gtf, Region)
|
||||
from src.liftover import convert_interval
|
||||
from src.snp import decode_snp, generate_sequences_with_combinations
|
||||
import itertools
|
||||
from src.editseq import run_analysis
|
||||
|
||||
|
||||
# Remove loguru's default handler so we fully control the sinks.
logger.remove()

# Add a single stderr sink that only emits INFO-and-above records.
# logger.add(level="INFO")
logger.add(
    sys.stderr,
    colorize=True,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
    # <cyan>{name}</cyan>: <cyan>{function}</cyan>: <cyan>{line}</cyan>
    level="INFO"
)
|
||||
|
||||
|
||||
def split_regions(cds):
    """
    Split the original CDS intervals into 3-bp amino-acid (codon) regions.

    Codons that straddle an exon boundary are emitted as two partial regions:
    the tail of the previous exon (kind="end") and the head of the next exon
    (kind="start").  A negative `start` value between exons encodes how many
    bases of the split codon still have to be taken from the next exon.

    Test case exons:
    14:103698801-103699017
    14:103699133-103699179
    14:103699364-103699576
    14:103703173-103703327
    14:103707003-103707215
    14:103708522-103708659
    14:103711033-103711087
    """
    regions = []
    cds = sorted(cds, key=lambda x: (x.chrom, x.start, x.end))
    aa_codon_len = 3
    start = 0

    for x in cds:
        # start == 0 means no carry-over: begin at this exon's start.
        if start == 0:
            start = x.start
        elif start < 0:
            # A negative start means the previous exon did not divide evenly
            # into codons, so the leading few bases of this exon complete the
            # split codon and are written out as their own small region.
            regions.append(Region(x.chrom, x.start, x.start - start, kind="start"))
            regions[-1].addition = x
            start = x.start - start

        while start + aa_codon_len <= x.end:
            # Record whether this codon touches an exon boundary, and which one.
            code = "regular"
            if start == x.start:
                code = "start"
            elif start + aa_codon_len == x.end:
                code = "end"

            regions.append(Region(x.chrom, start, start + aa_codon_len, kind=code))
            regions[-1].addition = x
            start += aa_codon_len

        if start < x.end:
            # Codon crosses the exon's end boundary; emit the partial region
            # and encode the remaining bases as a negative carry for the next
            # exon.  NOTE(review): when exactly 1 bp remains, the carry
            # computes to 0, which the next iteration treats as "no carry" —
            # confirm whether coordinates are half-open and whether this is an
            # off-by-one.
            regions.append(Region(x.chrom, start, x.end, kind="end"))
            regions[-1].addition = x
            start = start - x.end + 1
        else:
            # Exon divided evenly; reset the carry pointer.
            start = 0

    return regions
|
||||
|
||||
|
||||
def download_uniprot_region(protein, output):
    """Download UniProt genomic-coordinate mappings for *protein* and write a TSV.

    The raw JSON response is cached next to *output* (same name, .json suffix)
    and reused on subsequent calls.  Each TSV row maps a genomic exon interval
    to its protein-coordinate interval.  Raises ValueError when the accession
    is not a human protein.
    """
    # `resp` first holds the cache file path, then the decoded JSON payload.
    resp = output.replace(".tsv", ".json")
    url = f"https://www.ebi.ac.uk/proteins/api/coordinates?accession={protein}"

    if os.path.exists(resp):
        with open(resp, "r") as r:
            resp = json.load(r)
    else:
        resp = rq.get(url, headers={"Accept": "application/json"})
        resp = resp.json()

    # Persist (or re-persist) the JSON cache.
    with open(output.replace(".tsv", ".json"), "w+") as w:
        json.dump(resp, w, indent=4)

    if not resp[0]["name"].endswith("HUMAN"):
        raise ValueError(f"protein is not human")

    # Canonical chromosome names, with and without the "chr" prefix.
    __chroms__ = [str(x) for x in range(1, 23)] + ["chr" + str(x) for x in range(1, 23)] + ["X", "Y", "chrX", "chrY"]

    with open(output, "w+") as w:
        # First line records the source URL for provenance.
        w.write(f"#{url}\n")
        for coord in resp[0]["gnCoordinate"]:
            chromosome = coord["genomicLocation"]["chromosome"]

            # Skip mappings on scaffolds/patches.
            if chromosome not in __chroms__:
                continue

            for row in coord["genomicLocation"]["exon"]:
                genome = row["genomeLocation"]
                genome = str(genome["begin"]["position"]) + "-" + str(genome["end"]["position"])

                protein = row["proteinLocation"]

                # Single-residue locations carry "position" instead of begin/end.
                if "end" not in protein and "position" in protein:
                    protein = [str(protein["position"]["position"]), "-", str(protein["position"]["position"])]
                else:
                    protein = [str(protein["begin"]["position"]), "-", str(protein["end"]["position"])]

                row = f"{chromosome}:{genome}\t{'\t'.join(protein)}"
                w.write(row + "\n")

            # Only the first mapping on a canonical chromosome is kept.
            break
|
||||
|
||||
|
||||
def get_aa_coords(genes, output):
    """For each gene in the Excel sheet *genes*, find its reviewed human
    UniProt accession and download its genomic/protein coordinate TSV into
    *output* via download_uniprot_region().
    """
    os.makedirs(output, exist_ok=True)
    df = pd.read_excel(genes)
    # df = df.loc[df["Batch"] == 1, :]

    for _, row in df.iterrows():
        # Gene symbol is taken positionally from the second column —
        # presumably the sheet layout is fixed; confirm against the workbook.
        gene_name = row[1]

        url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json"
        resp = rq.get(url)

        # NOTE: the inner `row` shadows the outer loop variable (harmless —
        # the outer value is not used again afterwards).
        for row in resp.json().get("results", []):
            if "HUMAN" in row["uniProtkbId"]:
                priority = row["primaryAccession"]
                download_uniprot_region(priority, os.path.join(output, f"{gene_name}_{priority}.tsv"))
                break
|
||||
|
||||
|
||||
|
||||
def adjust_cross_border_region(row):
    """Return 'chrom:start-end' for *row*, widening sub-codon cross-border
    regions to a full 3 bp.

    A region shorter than 3 bp whose kind carries a "cross" marker is extended
    into the neighbouring intron: a cross_start region is grown leftwards
    (fixed end), anything else rightwards (fixed start).  All other regions
    are rendered unchanged via str().
    """
    if len(row) >= 3 or "cross" not in row.kind:
        return str(row)

    if row.kind == "cross_start":
        lo, hi = row.end - 3, row.end
    else:
        lo, hi = row.start, row.start + 3
    return f"{row.chrom}:{lo}-{hi}"
|
||||
|
||||
|
||||
def design_by_aa(genes, fasta, output, stop_codon = False):
    """Design per-amino-acid saturation mutation sites and rules.

    For every gene TSV in *genes*, split its CDS into codon regions, extract
    their sequences from *fasta*, enumerate 3N variants per codon, filter out
    variants that would break intron-adjacent bases on cross-border codons,
    and write the resulting table to *output* (CSV).  When *stop_codon* is
    True, only variants introducing a stop codon are kept.
    """
    df = []
    for gene in glob(os.path.join(genes, "*.tsv")):
        logger.info(f"开始设计突变 {gene}...")
        key = os.path.basename(gene).split(".")[0]

        # Load the previously downloaded genomic regions.
        cds = load_uniprot_region(gene)

        # Partition into per-amino-acid (3 bp) regions.
        cds = split_regions(cds)

        if not cds:
            continue

        # Attach sequences.
        cds = extract_orf_sequence(fasta, cds, half_open=True)

        for idx, x in enumerate(cds):
            for strategy in ["3N"]:
                results = design_mutations_for_orf(x.sequence, strategy=strategy)
                for res in results:
                    for var in res["variants"]:
                        if var == res["original_codon"]:
                            continue

                        # 1 bp at the exon start: the other 2 bp lie in the previous
                        # exon's end, so the variant's first 2 bases must match the
                        # recorded intron-adjacent sequence.
                        if "cross_start" == x.kind and len(x) == 1 and var[:2] != x.sequence[:2]:
                            continue

                        # 2 bp at the exon start: 1 bp lies in the previous exon,
                        # so the variant's first base must match.
                        elif "cross_start" == x.kind and len(x) == 2 and var[0] != x.sequence[0]:
                            continue

                        # 1 bp at the exon end: the trailing 2 bp lie downstream,
                        # so the variant's last 2 bases must match.
                        elif "cross_end" == x.kind and len(x) == 1 and var[1:] != x.sequence[1:]:
                            continue

                        # 2 bp at the exon end: the trailing base must be C or G.
                        elif "cross_end" == x.kind and len(x) == 2 and var[-1] not in ["C", "G"]:
                            continue

                        row = [key, str(x.addition), idx+1, str(x), adjust_cross_border_region(x), x.kind, strategy, res["original_codon"], var]
                        df.append(row)

    df = pd.DataFrame(df)
    # NOTE: "origial_code" is a typo for "original_code", but downstream
    # consumers (e.g. editseq.process_aa_mutations) key on it — keep as-is.
    df.columns = ["gene", "cds_region", "aa_index", "aa_region", "region_with_intron", "cross_cds_border", "strategy",
                  "origial_code", "mutation_code"]
    # Re-label each row's strategy by the actual number of changed bases.
    strategy = []
    for _, row in df.iterrows():
        match = np.sum([x == y for x, y in zip(row["origial_code"], row["mutation_code"])])
        strategy.append(f"{3-match}N")

    df["strategy"] = strategy

    if stop_codon:
        df = df[df["mutation_code"].isin(["TAA", "TAG", "TGA"])]

    df.to_csv(output, index = False)
|
||||
|
||||
|
||||
def design_by_snp(snp_info, targets, genes, fasta, fasta_hg38, output):
    """Design mutation rows from curated SNP annotations.

    snp_info:   Excel workbook mapping cDNA changes to genomic (hg19) changes,
                one sheet per gene.
    targets:    Excel workbook (3rd sheet) whose columns are genes and whose
                cells are mutation codes.
    genes:      directory of per-gene UniProt region TSVs (for chromosome and
                CDS start lookup).
    fasta / fasta_hg38: reference genomes for hg19 and hg38 respectively.
    output:     CSV path written incrementally.
    """
    logger.info("读取染色体")
    # Per-gene chromosome and CDS start, taken from the first sorted region.
    chroms = {}
    starts = {}
    for gene in glob(os.path.join(genes, "*.tsv")):
        key = os.path.basename(gene).split(".")[0]
        cds = load_uniprot_region(gene)
        cds = sorted(cds, key=lambda x:[x.chrom, x.start, x.end])
        chroms[key] = cds[0].chrom
        starts[key] = cds[0].start

    logger.info(f"读取snp的信息:{snp_info}")
    all_sheets = pd.read_excel(snp_info, sheet_name=None)

    # Walk every worksheet and build cDNA -> genomic lookup tables.
    # NOTE: the column headers contain trailing spaces — presumably they match
    # the workbook exactly; do not "fix" them.
    res = {}
    for sheet_name, df in all_sheets.items():
        temp = {}
        for _, row in df.iterrows():
            cdna = row["DNA change (cDNA) "]
            hg38 = row["DNA change (genomic) (hg19) "]
            temp[cdna] = hg38
        # A sheet name may contain several gene aliases (split on parentheses
        # and whitespace); register the table under each token.
        for sheet in re.split(r"[\((\s\))]", sheet_name):
            res[sheet] = temp

    print(res.keys())

    logger.info(f"读取目标:{targets}")
    df = pd.read_excel(targets, sheet_name=2)

    with open(output, "w+") as w:
        w.write(",".join(["gene", "cdna code", "genomic code", "mutation_region", "version", "original_codon", "mutation_code"]) + "\n")
        for column in df.columns:
            # Skip pandas' auto-generated placeholder columns.
            if "Unnamed" in column:
                continue

            for code in df[column]:
                # Skip empty cells (NaN floats).
                if not isinstance(code, str) and math.isnan(code):
                    continue

                genomic_code = res.get(column, {}).get(code)

                # Prefer the curated genomic (hg19) change; otherwise decode a
                # raw cDNA "c." code relative to the gene's CDS start (hg38).
                # "FAND2" is a spreadsheet misspelling mapped back to FANCD2.
                if genomic_code:
                    sites, rule = decode_snp(genomic_code)
                elif str(code).startswith("c."):
                    sites, rule = decode_snp(code, ref_start=starts["FANCD2" if column == "FAND2" else column])
                else:
                    continue

                region = Region(chroms["FANCD2" if column == "FAND2" else column], start=sites[0], end=sites[-1])

                # Track which reference genome the coordinates refer to.
                hg38 = False
                if genomic_code:
                    region = extract_orf_sequence(fasta, [region])[0]
                elif str(code).startswith("c."):
                    hg38 = True
                    region = extract_orf_sequence(fasta_hg38, [region])[0]

                # Decode the mutation rule into original/replacement sequences
                # (mirrors decode_mutation(); branch order matters: exact
                # "ins" before the "delins"/"ins" substring checks).
                original, replacement = "", ""
                if ">" in rule:
                    original, replacement = rule.split(">")
                    original = region.sequence
                elif rule == "dup":
                    original = region.sequence
                    replacement = original * 2
                elif rule == "del":
                    original = region.sequence
                    replacement = ""
                elif rule == "ins":
                    replacement = region.sequence
                elif "delins" in rule:
                    original = region.sequence
                    replacement = rule.replace("delins", "")
                elif "ins" in rule:
                    original = region.sequence
                    replacement = rule.replace("ins", "")

                if not genomic_code:
                    genomic_code = ""

                # Expand every N in the sequences into all concrete base
                # combinations and emit one CSV row per combination.
                for o, r in itertools.product(generate_sequences_with_combinations(original), generate_sequences_with_combinations(replacement)):
                    w.write(",".join([column, code.strip(), str(genomic_code).strip(), str(region), "hg38" if hg38 else "hg19", o, r]) + "\n")

    # data = pd.DataFrame(data)
    # data.columns = ["gene", "cdna code", "genomic code", "mutation_region", "original_codon", "mutation_code"]
    # data.to_csv(output, index = False)
|
||||
|
||||
|
||||
|
||||
def extract_fastq_seq(fastq: str, chrom, start, end):
    """Fetch chrom:start-end from an indexed FASTA via pysam.

    NOTE(review): despite the name and parameter, this expects a FASTA file
    (pysam.FastaFile).  pysam's fetch(contig, start, end) uses 0-based,
    half-open coordinates.
    """
    import pysam
    with pysam.FastaFile(fastq) as fh:
        rec = fh.fetch(str(chrom), start, end)
        # print(rec)
        return rec
|
||||
|
||||
|
||||
def decode_mutation(rule: str, sequence):
    """Decode an HGVS-style mutation *rule* against a reference *sequence*.

    Returns (original, replacement):
      "X>Y"       -> (sequence, Y)           substitution
      "dup"       -> (sequence, sequence*2)  duplication
      "del"       -> (sequence, "")          deletion
      "ins"       -> ("", sequence)          insertion of the given sequence
      "delins..." -> (sequence, suffix)      deletion-insertion
      "ins..."    -> (sequence, suffix)      insertion with explicit bases
    Unrecognised rules yield ("", "").
    """
    if ">" in rule:
        _, replacement = rule.split(">")
        return sequence, replacement
    if rule == "dup":
        return sequence, sequence * 2
    if rule == "del":
        return sequence, ""
    if rule == "ins":
        return "", sequence
    # Substring checks last; "delins" must be tested before the bare "ins".
    if "delins" in rule:
        return sequence, rule.replace("delins", "")
    if "ins" in rule:
        return sequence, rule.replace("ins", "")
    return "", ""
|
||||
|
||||
|
||||
def design_by_hmgd(data, fasta, outfile):
    """Build EditSeq rows from an HGMD-style mutation CSV.

    *data* must provide columns: gene, hgvs, chromosome, startCoord, endCoord.
    For each row, the reference sequence plus 100 bp flanks are extracted from
    *fasta* and formatted as "flank(orig/repl)flank"; results are written to
    *outfile* as CSV.
    """
    import re
    res = pd.read_csv(data)
    # print(res.head())

    # Expected input columns:
    # hgvs
    # chromosome
    # startCoord
    # endCoord

    data = []
    for idx, row in res.iterrows():

        # Row index makes the name unique per gene.
        key = row["gene"] + "_" + str(idx)

        try:
            # Coordinates appear to be 1-based inclusive: convert start to
            # pysam's 0-based half-open convention.
            seq = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1,row["endCoord"])

            seq, replace = decode_mutation(row["hgvs"], seq)

            if not seq:
                continue
            # Strip position digits/underscores embedded in the HGVS suffix.
            replace = re.sub(r"[\d_]", "", replace)

            if "del" in replace:
                replace = ""

            print(key, seq, replace)

            # 100 bp flanks on either side of the mutation.
            before = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1 - 100, row["startCoord"])
            after = extract_fastq_seq(fasta, int(row["chromosome"]), row["endCoord"], row["endCoord"] + 100)


            seq = f"{before}({seq}/{replace}){after}"
            data.append({"sequence_name": key, "editseq": seq})
        except Exception:
            # Deliberate best-effort: skip rows with unparsable coordinates /
            # contigs missing from the FASTA rather than aborting the run.
            continue

    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # NOTE(review): Fire is imported but no Fire(...) entry point is active —
    # the pipeline steps below are toggled by (un)commenting them.
    from fire import Fire

    # get_aa_coords(
    #     "../metainfo/Cancer and blood disorder panels_v2.xlsx",
    #     "../gene_coords/batch2"
    # )

    # get_aa_coords(
    #     "../metainfo/DDR gene library in 2021 Cell.xlsx",
    #     "../gene_coords/positive"
    # )


    # # Fire({"aa": design_by_aa})
    # design_by_aa(
    #     "../gene_coords/batch2",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_batch2.csv.gz"
    # )

    # design_by_aa(
    #     "../gene_coords/positive",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_positive.csv.gz",
    #     stop_codon = True
    # )

    # run_analysis(
    #     "../gene_aa_target_batch2.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/batch2"
    # )

    # run_analysis(
    #     "../gene_aa_target_positive.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/positive"
    # )

    # Generate the SNP table; snp_info is the curated SNP workbook,
    # targets lists the genes to process.
    # design_by_snp(
    #     snp_info="../metainfo/副本FA家族基因-20250829-DJJ_XD.xlsx",
    #     targets="../metainfo/实验计划.xlsx",
    #     output="gene_snp_target.csv",
    #     fasta="../ref/gencode/GRCh37.p13.genome.fa.gz",
    #     fasta_hg38="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     genes="../gene_coords"
    # )

    # design_by_hmgd(
    #     "../metainfo/allmut.csv",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outfile="../../prediction/input/pos_v2.csv.gz"
    # )


    # url = "https://www.ebi.ac.uk/proteins/api/coordinates?accession=P21359-1"
    # download_uniprot_region("Test", "P21359")
|
||||
|
||||
|
||||
16
design/pyproject.toml
Normal file
16
design/pyproject.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[project]
|
||||
name = "pgrna"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
    "biopython>=1.85",
    "fire>=0.7.1",
    "loguru>=0.7.3",
    "openpyxl>=3.1.5",
    "pandas>=2.3.3",
    "pyfaidx>=0.9.0.3",
    "pyliftover>=0.4.1",
    # requests is imported at the top of design/main.py but was missing here
    "requests>=2.32",
    "rich>=14.2.0",
]
|
||||
233
design/src/editseq.py
Normal file
233
design/src/editseq.py
Normal file
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
created by lanzl
|
||||
modified by zym
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import re
|
||||
import sys
|
||||
from pyfaidx import Fasta, FetchError
|
||||
|
||||
HG19_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/gencode/GRCh37.p13.genome.fa.gz"
|
||||
HG38_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
|
||||
|
||||
|
||||
def parse_region(region_str: str) -> tuple:
    """Parse a 'chrom:start-end' genomic region string.

    Ensures the chromosome name carries a 'chr' prefix and returns
    (chrom, start, end) with integer coordinates.

    Raises:
        ValueError: when *region_str* does not match 'chrom:start-end'.
            (Previously a malformed string crashed with an opaque
            AttributeError on the unchecked re.match result.)
    """
    match = re.match(r"(\w+):(\d+)-(\d+)", region_str)
    if match is None:
        raise ValueError(f"invalid genomic region: {region_str!r}")
    chrom, start, end = match.groups()

    if not chrom.lower().startswith("chr"):
        chrom = "chr" + chrom

    return chrom, int(start), int(end)
|
||||
|
||||
|
||||
def extract_orf_sequence(genome: Fasta, chrom: str, start: int, end: int) -> str:
    """
    Extract an uppercased sequence from a preloaded pyfaidx Fasta
    (1-based, inclusive coordinates).

    Falls back to the alternate chromosome naming convention: if *chrom*
    starts with 'chr' the prefix is stripped, otherwise it is added.
    (Previously only the strip direction was tried, so a bare name against a
    'chr'-prefixed FASTA always failed.)

    Raises:
        FetchError: when neither naming exists in the FASTA index.
    """
    try:
        return str(genome.get_seq(chrom, start, end)).upper()
    except (KeyError, FetchError):
        # Try the opposite "chr" convention before giving up.
        if chrom.lower().startswith("chr"):
            alt_chrom = chrom[3:]
        else:
            alt_chrom = "chr" + chrom

        try:
            return str(genome.get_seq(alt_chrom, start, end)).upper()
        except (KeyError, FetchError) as inner_e:
            raise FetchError(
                f"Requested rname '{chrom}' (also tried '{alt_chrom}') does not exist in FASTA index."
            ) from inner_e
|
||||
|
||||
|
||||
def generate_editseq(
    original: str,
    replacement: str,
    region_str: str,
    genome: Fasta,
    flank_size: int = 100,
) -> str:
    """Build an EditSeq string: upstream flank, '(edit)', downstream flank.

    The edit part is '(ORIG/REPL)' for substitutions/delins, '(-ORIG)' for
    deletions and '(+REPL)' for insertions.  Flanks of *flank_size* bp are
    taken immediately outside the mutated interval.
    """
    chrom, mut_start, mut_end = parse_region(region_str)

    # Flanking sequence immediately before and after the mutated interval.
    upstream_flank = extract_orf_sequence(
        genome, chrom, mut_start - flank_size, mut_start - 1
    )
    downstream_flank = extract_orf_sequence(
        genome, chrom, mut_end + 1, mut_end + flank_size
    )

    orig = str(original).strip()
    repl = str(replacement).strip()

    # All replacements (equal or unequal length) share the (ORIG/REPL) form.
    if orig and repl:
        mut_part = f"({orig}/{repl})"
    elif orig:
        mut_part = f"(-{orig})"          # deletion
    elif repl:
        mut_part = f"(+{repl})"          # insertion
    else:
        mut_part = "(Invalid mutation logic)"

    return f"{upstream_flank}{mut_part}{downstream_flank}"
|
||||
|
||||
|
||||
# --- Amino-acid (AA) saturation mutagenesis processing ---
def process_aa_mutations(df_aa: pd.DataFrame, genome_hg38: Fasta) -> pd.DataFrame:
    """Process AA saturation-mutagenesis rows into EditSeq records.

    Returns a DataFrame with sequence_name, editseq, strategy and
    mutation_type columns.
    """
    results = []

    # to_dict('records') is markedly faster than iterrows().
    for row in df_aa.to_dict("records"):
        original = (
            str(row["origial_code"]).strip() if pd.notna(row["origial_code"]) else ""
        )
        replacement = (
            str(row["mutation_code"]).strip() if pd.notna(row["mutation_code"]) else ""
        )

        # Classify the mutation for reporting.
        if original and replacement:
            mut_type = "REPL"
        elif original and not replacement:
            mut_type = "DEL"
        elif not original and replacement:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"

        # Sequence name keeps the raw cell values, as before.
        seq_name = f"{row['gene']}_AA{row['aa_index']}_{row['origial_code']}_{row['mutation_code']}"

        # Bug fix / consistency: pass the normalized `original`/`replacement`
        # (exactly as process_snp_mutations does) instead of the raw cells, so
        # a NaN cell becomes "" rather than the literal string "nan" inside
        # the generated editseq.
        editseq = generate_editseq(
            original=original,
            replacement=replacement,
            region_str=row["aa_region"],
            genome=genome_hg38,
        )

        results.append(
            {
                "sequence_name": seq_name,
                "editseq": editseq,
                "strategy": row["strategy"],
                "mutation_type": mut_type,
            }
        )

    return pd.DataFrame(results)
|
||||
|
||||
|
||||
# --- SNP/cDNA mutation processing ---
def process_snp_mutations(
    df_snp: pd.DataFrame, genome_hg19: Fasta, genome_hg38: Fasta
) -> pd.DataFrame:
    """Process SNP/cDNA mutation rows into EditSeq records.

    Each row selects hg19 or hg38 via its 'version' column; returns a
    DataFrame with sequence_name, editseq and mutation_type columns.
    """
    records = []

    # Plain-dict iteration is much faster than iterrows().
    for entry in df_snp.to_dict("records"):
        orig = entry["original_codon"]
        orig = str(orig).strip() if pd.notna(orig) else ""
        repl = entry["mutation_code"]
        repl = str(repl).strip() if pd.notna(repl) else ""

        # Pick the reference genome matching this row's coordinate version.
        version = str(entry["version"]).lower()
        genome_to_use = genome_hg38 if version == "hg38" else genome_hg19

        # Mutation class used in the sequence name.
        if orig and repl:
            mut_type = "REPL"
        elif orig and not repl:
            mut_type = "DEL"
        elif not orig and repl:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"

        # Sequence name: gene, class and a filesystem-safe cDNA code.
        cdna_code_clean = str(entry["cdna code"]).replace(".", "").replace("_", "p")
        seq_name = f"{entry['gene']}_{mut_type}_{cdna_code_clean}"

        editseq = generate_editseq(
            original=orig,
            replacement=repl,
            region_str=str(entry["mutation_region"]),
            genome=genome_to_use,
        )

        records.append(
            {"sequence_name": seq_name, "editseq": editseq, "mutation_type": mut_type}
        )

    return pd.DataFrame(records)
|
||||
|
||||
|
||||
def run_analysis(infile, reference, outdir):
    """Split the designed-mutation table *infile* by strategy and write one
    EditSeq CSV per strategy into *outdir*, using *reference* as the genome.
    """
    # AA_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_aa_target.csv"
    # SNP_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_snp_target.csv"

    # genome_hg19 = Fasta(HG19_FASTA_PATH)
    # genome_hg38 = Fasta(HG38_FASTA_PATH)
    genome = Fasta(reference)
    os.makedirs(outdir, exist_ok=True)

    aa_df_input = pd.read_csv(infile)
    # snp_df_input = pd.read_csv(SNP_INPUT_FILE)

    # --- Stage 1: process SNP/cDNA mutations (currently disabled) ---
    # snp_df = process_snp_mutations(snp_df_input, genome_hg19, genome_hg38)
    # snp_output_file = "snp_editseq_output.csv"
    # snp_df.to_csv(snp_output_file, index=False)

    # --- Stage 2: process AA mutations grouped by strategy ---
    aa_df_input["strategy"] = aa_df_input["strategy"].str.upper()
    strategies = aa_df_input["strategy"].unique()

    for strategy in strategies:
        # Skip empty strategies.
        if pd.isna(strategy):
            continue

        # Filter to this strategy's subset.
        aa_subset_df = aa_df_input[aa_df_input["strategy"] == strategy].copy()
        if aa_subset_df.empty:
            continue

        # Process the subset.
        aa_df_processed = process_aa_mutations(aa_subset_df, genome)

        # Save one CSV per strategy.
        aa_output_file = f"aa_{strategy}_editseq_output.csv"
        aa_df_processed.to_csv(os.path.join(outdir, aa_output_file), index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Bug fix: run_analysis() has three required parameters, so the previous
    # bare call always raised TypeError.  Take the arguments from the command
    # line instead.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {os.path.basename(sys.argv[0])} <infile> <reference> <outdir>")
    run_analysis(infile=sys.argv[1], reference=sys.argv[2], outdir=sys.argv[3])
|
||||
53
design/src/liftover.py
Normal file
53
design/src/liftover.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import pandas as pd
|
||||
|
||||
from pyliftover import LiftOver
|
||||
|
||||
# lo = LiftOver('/home/zym/projects/pgRNA/liftover/hg19ToHg38.over.chain.gz')
|
||||
|
||||
from pyliftover import LiftOver
|
||||
|
||||
# Build the hg19 -> hg38 coordinate converter.
# NOTE(review): constructed at import time, which fetches/loads the chain
# file as a module side effect — consider lazy initialization.
lo = LiftOver("hg19", "hg38")
|
||||
|
||||
|
||||
def convert_interval(chrom, start, end):
    """
    Lift the interval (start, end) from the source genome (hg19) to the
    target genome (hg38).

    Returns (new_chrom, new_start, new_end), or None when either endpoint
    cannot be converted.

    NOTE(review): pyliftover's documentation states convert_coordinate()
    expects 0-based positions (the original comment here claimed 1-based) —
    confirm which convention callers pass before relying on exact offsets.
    """
    result_start = lo.convert_coordinate(chrom, start)
    result_end = lo.convert_coordinate(chrom, end)

    if not result_start or not result_end:
        return None  # unconvertible

    # Keep the mapping with the highest chain score.
    best_start = max(result_start, key=lambda x: x[3])
    best_end = max(result_end, key=lambda x: x[3])

    # NOTE(review): only the start's chromosome is returned — if the two
    # endpoints lift to different chromosomes this silently mixes them.
    new_chrom = best_start[0]
    new_start = best_start[1]
    new_end = best_end[1]

    # Guarantee start <= end (strand flips can reverse the order).
    if new_start >= new_end:
        new_start, new_end = new_end, new_start
    return new_chrom, new_start, new_end
|
||||
|
||||
|
||||
def get_seq(path, coord):
    """Return the sequence for coord=(chrom, start, end) from an indexed FASTA.

    Uses pysam's region-string fetch; pysam reads the accompanying .fai index
    automatically.  NOTE(review): region strings in pysam are 1-based
    inclusive (unlike the 0-based start/end argument form) — confirm the
    convention callers expect.
    """
    import pysam

    # Bug fix: close the FASTA handle when done instead of leaking the file
    # descriptor (the previous version never called close()).
    with pysam.FastaFile(path) as fasta:
        return fasta.fetch(region=f"{coord[0]}:{coord[1]}-{coord[2]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # No CLI behaviour yet; this module is used as a library.
    pass
|
||||
216
design/src/mutation.py
Normal file
216
design/src/mutation.py
Normal file
@@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import itertools
|
||||
|
||||
from Bio.Seq import Seq
|
||||
from loguru import logger
|
||||
|
||||
# Standard genetic code: DNA codon -> one-letter amino acid ('*' = stop).
codon_table = {
    # Phe / Leu
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    # Ser
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    # Tyr / stop
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    # Cys / stop / Trp
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
    # Leu
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    # Pro
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    # His / Gln
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    # Arg
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    # Ile / Met
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    # Thr
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    # Asn / Lys
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    # Ser / Arg
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    # Val
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    # Ala
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    # Asp / Glu
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    # Gly
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}
|
||||
|
||||
# Reverse lookup table: amino acid -> every codon that encodes it.
aa_to_codons = {}
for codon, aa in codon_table.items():
    aa_to_codons.setdefault(aa, []).append(codon)

# DNA alphabet used when enumerating variants.
bases = ["A", "T", "G", "C"]
|
||||
|
||||
|
||||
def generate_nnn():
    """Return all 64 NNN codon combinations (A/T/G/C product order)."""
    return ["".join(triplet) for triplet in itertools.product("ATGC", repeat=3)]
|
||||
|
||||
|
||||
def generate_2n_variants(original_codon, fixed_position=None):
    """
    Generate double-N variants: two codon positions vary freely, one keeps
    the original base.

    fixed_position: 0, 1 or 2 — which position stays unchanged.  When None,
    all three patterns (NNx, NxN, xNN) are produced.

    Returns a sorted, de-duplicated list of codon strings (the original
    codon itself is always included).
    """
    variants = set()
    positions = [fixed_position] if fixed_position is not None else [0, 1, 2]

    for pos in positions:
        free = [i for i in range(3) if i != pos]
        for b1, b2 in itertools.product("ATGC", repeat=2):
            codon = ["_", "_", "_"]
            codon[pos] = original_codon[pos]
            codon[free[0]] = b1
            codon[free[1]] = b2
            variants.add("".join(codon))

    return sorted(variants)
|
||||
|
||||
|
||||
def generate_1n_variants(original_codon, fixed_positions=None):
|
||||
"""
|
||||
生成单N突变(一个位置随机,两个固定)
|
||||
fixed_positions: 如 [0,1] 表示第0和第1位固定
|
||||
若不指定,则生成所有三种模式:ANT, ATN, TAN
|
||||
"""
|
||||
variants = set()
|
||||
if fixed_positions:
|
||||
positions = [fixed_positions]
|
||||
else:
|
||||
positions = [[0, 1], [0, 2], [1, 2]]
|
||||
|
||||
for fix in positions:
|
||||
var_pos = 3 - sum(fix) # 剩下那个位置是变量
|
||||
for i in range(3):
|
||||
if i not in fix:
|
||||
var_pos = i
|
||||
break
|
||||
base1, base2 = original_codon[fix[0]], original_codon[fix[1]]
|
||||
for b in bases:
|
||||
codon_list = ["_", "_", "_"]
|
||||
codon_list[fix[0]] = base1
|
||||
codon_list[fix[1]] = base2
|
||||
codon_list[var_pos] = b
|
||||
variant = "".join(codon_list)
|
||||
variants.add(variant)
|
||||
return sorted(variants)
|
||||
|
||||
|
||||
def translate(codon):
    """Translate one DNA codon via codon_table; unknown codons map to 'X'."""
    return codon_table.get(codon, "X")
|
||||
|
||||
|
||||
def design_mutations_for_orf(dna_seq, strategy="3N"):
    """
    Design saturation mutations across a whole ORF sequence.

    strategy: '3N' (all 64 codons), '2N' (two positions vary) or
    '1N' (one position varies).

    Returns one dict per codon position with the original codon/amino acid,
    the variant codons, and summary counts (distinct amino acids, stops).

    Raises ValueError when len(dna_seq) is not a multiple of 3, or on an
    unknown strategy.
    """
    if len(dna_seq) % 3 != 0:
        raise ValueError(f"ORF 长度必须是 3 的倍数!{dna_seq}")

    num_codons = len(dna_seq) // 3
    results = []

    for i in range(num_codons):
        start = i * 3
        end = start + 3
        orig_codon = dna_seq[start:end]
        orig_aa = translate(orig_codon)

        logger.debug(
            f"\n--- 位点 {i + 1} (氨基酸 {i + 1}): {orig_aa} ({orig_codon}) ---"
        )

        # Enumerate variant codons according to the chosen strategy.
        variants = []
        if strategy == "3N":
            variants = generate_nnn()
            logger.debug(f"策略: 3N (NNN) → 共 {len(variants)} 种组合")
        elif strategy == "2N":
            variants = generate_2n_variants(orig_codon)
            logger.debug(f"策略: 2N (任意两个随机) → 共 {len(variants)} 种组合")
        elif strategy == "1N":
            variants = generate_1n_variants(orig_codon)
            logger.debug(f"策略: 1N (任意一个随机) → 共 {len(variants)} 种组合")
        else:
            raise ValueError("strategy 必须是 '3N', '2N', 或 '1N'")

        # Filter out malformed codons (should never occur in practice).
        valid_variants = [v for v in variants if len(v) == 3]

        # Summarize the mutational outcome per amino acid.
        mutant_aa_count = {}
        stop_count = 0
        for v in valid_variants:
            aa = translate(v)
            if aa == "*":
                stop_count += 1
            mutant_aa_count[aa] = mutant_aa_count.get(aa, 0) + 1

        logger.debug(f"→ 共产生 {len(valid_variants)} 个有效突变")
        logger.debug(f"→ 可产生 {len(mutant_aa_count)} 种不同氨基酸(含终止)")
        logger.debug(f"→ 引入终止密码子: {stop_count} 次")
        logger.debug(f"→ 氨基酸分布: {mutant_aa_count}")

        results.append(
            {
                "position": i + 1,
                "original_codon": orig_codon,
                "original_aa": orig_aa,
                "variants": valid_variants,
                "variant_count": len(valid_variants),
                "mutant_aa_count": mutant_aa_count,
                "stop_count": stop_count,
            }
        )

    return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # No CLI behaviour yet; this module is used as a library.
    pass
|
||||
236
design/src/reader.py
Normal file
236
design/src/reader.py
Normal file
@@ -0,0 +1,236 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
从gtf中读取CDS,并读取对应的sequence
|
||||
"""
|
||||
|
||||
import gzip
|
||||
|
||||
import pandas as pd
|
||||
from Bio.Seq import Seq
|
||||
from loguru import logger
|
||||
from pyfaidx import Fasta
|
||||
|
||||
|
||||
class Region(object):
    """A genomic interval (chrom, start, end, strand) used for merging and
    for carrying an extracted sequence plus codon-boundary bookkeeping.
    """

    def __init__(self, chrom, start, end, strand="+", kind=None):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.strand = strand
        self.sequence = None  # filled in later via set_seq()
        self.kind_ = kind  # raw boundary label; exposed through the `kind` property
        self.addition = None  # optional payload, e.g. the source CDS region

    @classmethod
    def create(cls, region):
        """Alternate constructor from a 'chrom:start-end' string ('+' strand)."""
        chrom, sites = region.split(":")
        sites = [int(x) for x in sites.split("-")]
        return cls(chrom, sites[0], sites[-1], "+")

    def set_seq(self, sequence: str):
        """Attach *sequence*, uppercased; reverse-complement on the '-' strand."""
        self.sequence = sequence.upper()

        # Bug fix: reverse-complement the uppercased copy (self.sequence).
        # Previously the raw input was used here, so minus-strand sequences
        # skipped the upper() normalization entirely.
        if self.strand == "-":
            self.sequence = str(Seq(self.sequence).reverse_complement())

    def __and__(self, other):
        """True when *other* overlaps this region on the same chromosome."""
        if self.chrom != other.chrom:
            return False

        return self.start < other.end and self.end > other.start

    def __add__(self, other):
        """Merge an overlapping region into this one in place; returns self.

        Raises ValueError when the regions do not overlap.
        """
        if not self & other:
            raise ValueError("没有重合位点")

        self.start = min(self.start, other.start)
        self.end = max(self.end, other.end)
        return self

    def __str__(self) -> str:
        return f"{self.chrom}:{self.start}-{self.end}"

    def __hash__(self):
        # Hash by coordinate string; note there is no matching __eq__, so
        # equality remains identity-based.
        return hash(str(self))

    def __len__(self):
        return self.end - self.start

    @property
    def kind(self):
        """Boundary label; sub-codon (<3 bp) regions get a 'cross_' prefix."""
        if len(self) >= 3:
            return self.kind_
        if not self.kind_:
            return ""
        else:
            return f"cross_{self.kind_}"
|
||||
|
||||
|
||||
def read_gtf(gtf_path):
    """
    Load a GTF file and return a DataFrame of its CDS records, with the
    ``attribute`` column parsed into one extra column per attribute key.
    """
    logger.info("正在读取 GTF 文件...")
    gtf_columns = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attribute",
    ]

    raw = pd.read_csv(
        gtf_path,
        sep="\t",
        comment="#",
        header=None,
        names=gtf_columns,
        low_memory=False,
    )

    # Keep only CDS features.
    cds_rows = raw[raw["feature"] == "CDS"].copy()

    # Expand the attribute strings into columns via json_normalize.
    try:
        parsed_attrs = pd.json_normalize(cds_rows["attribute"].apply(parse_attributes))
    except Exception as e:
        logger.error(f"解析 attribute 字段失败: {e}")
        raise

    # Glue the parsed attribute columns onto the original rows.
    result_df = pd.concat([cds_rows.reset_index(drop=True), parsed_attrs], axis=1)

    logger.info(f"成功读取并解析 GTF 文件,共 {len(result_df)} 个 CDS 特征。")
    return result_df
|
||||
|
||||
|
||||
def parse_attributes(attr_str):
    """
    Parse a GTF ``attribute`` field into a ``{key: unquoted value}`` dict.

    Empty entries and entries without a space separator are skipped.
    """
    parsed = {}
    for chunk in attr_str.split(";"):
        chunk = chunk.strip()
        if chunk and " " in chunk:
            key, _, value = chunk.partition(" ")
            parsed[key] = value.strip('"')
    return parsed
|
||||
|
||||
|
||||
def get_cds_for_gene(cds_df, gene_name):
    """
    Collect the CDS rows for one transcript and merge them into Regions.

    NOTE(review): despite the name and the original docstring ("group by
    transcript, pick the longest"), the code matches
    ``transcript_id == gene_name`` exactly -- callers must pass a transcript
    ID, not a gene symbol.
    """
    logger.info(f"正在查找基因 '{gene_name}' 的 CDS...")

    # Parse the raw attribute strings (adds a column to the caller's frame).
    cds_df["attributes_parsed"] = cds_df["attribute"].apply(parse_attributes)

    # Keep only rows belonging to the requested transcript.
    gene_cds_list = []
    for idx, row in cds_df.iterrows():
        attrs = row["attributes_parsed"]
        if attrs.get("transcript_id") == gene_name:
            # if attrs.get('gene_name') == gene_name or attrs.get('gene_id').startswith(gene_name):
            gene_cds_list.append(row)

    if not gene_cds_list:
        raise ValueError(f"未在 GTF 中找到基因 '{gene_name}'")

    # assumes cds_df came from read_gtf() so "transcript_id" exists as a
    # top-level column -- TODO confirm for other callers.
    df = pd.DataFrame(gene_cds_list)
    df = df[
        ["seqname", "feature", "start", "end", "strand", "transcript_id"]
    ].drop_duplicates()

    # Merge overlapping CDS intervals in file order.  Region.__add__ extends
    # ``last`` in place, so the aliasing below is intentional.
    res = []
    last = None
    for _, row in df.iterrows():
        temp = Region(
            str(row["seqname"]),
            row["start"],
            row["end"],
            str(row["strand"]),
            row["transcript_id"],  # stored as Region.kind_
        )
        if last is None:
            last = temp
        elif temp & last:
            last = last + temp
        else:
            res.append(last)
            last = temp
    # ``in`` falls back to identity comparison (Region defines no __eq__),
    # so this appends the final merged region exactly once.
    if last not in res:
        res.append(last)

    return res
|
||||
|
||||
|
||||
def load_uniprot_region(path):
    """
    Read ``chrom:start-end`` intervals (first whitespace-separated column of
    each line) from *path*, merging consecutive overlapping ones.

    Lines starting with ``#`` are skipped.
    """
    merged = []
    current = None
    with open(path) as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            candidate = Region.create(line.split()[0])
            if current is None:
                current = candidate
            elif candidate & current:
                # Region.__add__ extends ``current`` in place.
                current = current + candidate
            else:
                merged.append(current)
                current = candidate

    if current not in merged:
        merged.append(current)
    return merged
|
||||
|
||||
|
||||
def extract_orf_sequence(genome_fasta, cds_rows, half_open=False):
    """
    Extract the genomic sequence of each CDS Region so they can be
    concatenated into an ORF.

    Parameters:
        genome_fasta: path to an indexed reference FASTA file.
        cds_rows: list of Region objects; chrom/strand taken from the first.
        half_open: when True, treat ``end`` as exclusive (subtract 1).

    Returns the same list with each Region's ``sequence`` filled in.
    Raises ValueError when ``cds_rows`` is empty.
    """

    if not cds_rows:
        raise ValueError("not cds")

    seqname = cds_rows[0].chrom
    strand = cds_rows[0].strand

    logger.debug(f"从参考基因组提取序列 (chr{seqname})...")
    genome = Fasta(genome_fasta)

    # Fetch the chromosome record; retry with/without the "chr" prefix,
    # since FASTA naming conventions differ (e.g. "chr1" vs "1").
    try:
        chrom_seq = genome[seqname]
    except KeyError:
        if "chr" in seqname:
            seqname = seqname.replace("chr", "")
        else:
            seqname = "chr" + seqname
        chrom_seq = genome[seqname]

    for row in cds_rows:
        start = int(row.start) - 1  # GTF is 1-based; pyfaidx is 0-based
        end = int(row.end) - (1 if half_open else 0)

        # Sub-codon fragments flagged as crossing a CDS boundary are widened
        # to a full 3 bp codon on the appropriate side.
        if len(row) < 3 and "cross" in row.kind:
            if row.kind == "cross_start":
                start = end - 3
            else:
                end = start + 3

        row.set_seq(chrom_seq[start:end].seq)

    return cds_rows
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Library module only -- no standalone CLI behavior.
    pass
|
||||
139
design/src/safe_target.py
Normal file
139
design/src/safe_target.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
生成safe targeting 序列
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import random
|
||||
import pysam
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
seed = 42
|
||||
random.seed(42)
|
||||
|
||||
|
||||
__AAs__ = {
|
||||
"丙氨酸": ["GCU", "GCC", "GCA", "GCG"],
|
||||
"精氨酸": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
|
||||
"天冬酰胺": ["AAU", "AAC"],
|
||||
"天冬氨酸": ["GAU", "GAC"],
|
||||
"半胱氨酸": ["UGU", "UGC"],
|
||||
"谷氨酰胺": ["CAA", "CAG"],
|
||||
"谷氨酸": ["GAA", "GAG"],
|
||||
"甘氨酸": ["GGU", "GGC", "GGA", "GGG"],
|
||||
"组氨酸": ["CAU", "CAC"],
|
||||
"异亮氨酸": ["AUU", "AUC", "AUA"],
|
||||
"亮氨酸": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"],
|
||||
"赖氨酸": ["AAA", "AAG"],
|
||||
"甲硫氨酸": ["AUG"],
|
||||
"苯丙氨酸": ["UUU", "UUC"],
|
||||
"脯氨酸": ["CCU", "CCC", "CCA", "CCG"],
|
||||
"丝氨酸": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
|
||||
"苏氨酸": ["ACU", "ACC", "ACA", "ACG"],
|
||||
"色氨酸": ["UGG"],
|
||||
"酪氨酸": ["UAU", "UAC"],
|
||||
"缬氨酸": ["GUU", "GUC", "GUA", "GUG"],
|
||||
"终止密码子": ["UAA", "UAG", "UGA"],
|
||||
}
|
||||
|
||||
|
||||
def codons():
|
||||
for key, values in __AAs__.items():
|
||||
yield key, [value.replace("U", "T") for value in values]
|
||||
|
||||
|
||||
class Region:
    """A candidate safe-harbor interval, consumed codon-by-codon.

    ``shift(fasta)`` finds the first offset at which a known codon starts;
    ``choose(n)`` then samples codon-sized sub-regions from that frame.
    """

    def __init__(self, chrom: str, start: int, end: int):
        self.chrom = chrom
        self.start = start
        self.end = end
        # Reading-frame offset discovered by shift(); 0 until computed.
        # NOTE(review): dunder-style name for a plain private attribute is
        # unconventional (``_shift`` would be idiomatic).
        self.__shift__ = 0

    def __str__(self):
        return f"{self.chrom}:{self.start}-{self.end}"

    def shift(self, fasta: str):
        """Scan forward until a 3 bp window matches any codon in the table.

        NOTE(review): a frame found at offset 0 leaves ``__shift__ == 0``,
        so scanning continues and a later offset may overwrite it -- confirm
        this is intended.
        """
        for i in range(0, self.end - self.start):
            # Early exit once a previous iteration fixed the frame.
            if self.__shift__ != 0:
                break
            seq = extract_fastq_seq(fasta, Region(self.chrom, self.start+i, self.start+i+3))

            for _, values in codons():
                if seq in values:
                    self.__shift__ = i
                    break

    def choose(self, number: int = 3):
        """Return up to *number* random codon-aligned sub-regions.

        Falls back to returning every candidate when fewer than *number*
        complete codons fit into the interval.
        """
        length_of_codon = 3

        # Enumerate all complete in-frame codon windows.
        regions = []
        for i in range(self.start + self.__shift__, self.end, length_of_codon):
            if i + length_of_codon > self.end:
                break
            regions.append([i, i + length_of_codon])

        # np.choice(my_list, size=3, replace=False)

        if number > len(regions):
            return [Region(self.chrom, x[0], x[1]) for x in regions]

        return [Region(self.chrom, x[0], x[1]) for x in random.sample(regions, number)]
|
||||
|
||||
|
||||
def extract_fastq_seq(fastq: str, region: Region, seq_len: int = 100):
    """Fetch the reference bases covering *region* from an indexed FASTA.

    NOTE(review): despite the parameter name, ``fastq`` is a FASTA path
    (pysam.FastaFile), and ``seq_len`` is currently unused here.
    """
    with pysam.FastaFile(fastq) as fh:
        rec = fh.fetch(region.chrom, region.start, region.end)
        return rec
|
||||
|
||||
|
||||
def mutation(seq: str):
    """Return a random DNA codon encoding a DIFFERENT amino acid than *seq*.

    Reseeding with the module-level ``seed`` on every call makes the choice
    deterministic for a given input.  Implicitly returns None when *seq* is
    not found in the table (callers check for this).

    NOTE(review): *seq* is DNA (contains T) but ``__AAs__`` values are RNA
    codons (contain U), so any codon whose RNA form contains U can never
    match and yields None -- confirm this filtering is intended.
    """
    random.seed(seed)
    for key, value in __AAs__.items():
        if seq in value:
            # Pick a different amino acid, then one of its codons (as DNA).
            random_keys = random.sample([x for x in __AAs__.keys() if x != key], 1)[0]
            return random.sample(__AAs__[random_keys], 1)[0].replace("U", "T")
|
||||
|
||||
|
||||
|
||||
def main(infile, outfile, reference = "../ref/UCSC/hg19.fa.gz", seq_len: int = 100):
    """Design safe-targeting control edits.

    Samples 2000 rows from the "Human Safe Regions" sheet of *infile*, picks
    up to 5 codon-aligned sub-regions per entry, mutates each codon to a
    different amino acid, and writes a CSV with columns
    (sequence_name, editseq), where editseq is "<flank>(ref/mut)<flank>"
    with *seq_len* bases of flank on each side.
    """

    meta = pd.read_excel(infile, sheet_name="Human Safe Regions", header=None)
    # Deterministic subsample driven by the module-level seed.
    meta = meta.sample(n=2000, random_state=seed)

    data = []
    for idx in tqdm(meta.iloc[:, 0], total=meta.shape[0]):

        # First column appears to be "chrom;start;end" -- TODO confirm
        # against the source spreadsheet.
        idx = idx.split(";")
        region = Region(idx[0], int(idx[1]), int(idx[2]))
        region.shift(reference)

        regions = region.choose(5)

        for reg in regions:
            seq = extract_fastq_seq(reference, reg)
            mut = mutation(seq)

            # mutation() returns None for codons it cannot map; skip those.
            if seq is None or mut is None:
                continue

            key = str(reg) + "_" + seq + "_" + mut
            before = extract_fastq_seq(reference, Region(region.chrom, reg.start - seq_len, reg.start))
            after = extract_fastq_seq(reference, Region(region.chrom, reg.end, reg.end + seq_len))

            seq = f"{before}({seq}/{mut}){after}"
            data.append({"sequence_name": key, "editseq": seq})

    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: exposes main()'s parameters via python-fire.
    from fire import Fire
    Fire(main)
|
||||
|
||||
114
design/src/snp.py
Normal file
114
design/src/snp.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""用来解析snp错配信息"""
|
||||
|
||||
import re
|
||||
from itertools import product
|
||||
|
||||
|
||||
def generate_sequences_with_combinations(seq):
    """
    Expand every run of ``N`` in a DNA sequence into all A/T/C/G
    combinations and return the list of concrete sequences.

    Parameters:
        seq (str): input DNA sequence, possibly containing N characters.

    Returns:
        list: every possible sequence, in deterministic order (fixed
        segments kept as-is; each N position enumerates "ATCG" in order).
    """
    if "N" not in seq:
        return [seq]

    # Split into alternating maximal runs of N and non-N, preserving order.
    # This replaces the original hand-rolled index scan (and its redundant
    # function-local re-import of itertools.product) with one regex pass.
    pieces = re.findall(r"N+|[^N]+", seq)

    # Each fixed piece contributes exactly one option; each run of k Ns
    # contributes the 4**k combinations of "ATCG".
    per_piece = [
        ["".join(p) for p in product("ATCG", repeat=len(piece))]
        if piece[0] == "N"
        else [piece]
        for piece in pieces
    ]

    # Cartesian product across pieces reassembles the full sequences in the
    # same order the original nested loops produced them.
    return ["".join(parts) for parts in product(*per_piece)]
|
||||
|
||||
|
||||
def decode_snp(label, ref_start=0):
    """Parse an HGVS-like variant label into positions and an edit rule.

    Parameters:
        label: e.g. ``g.123A>G`` or ``NM_xxx:c.76+1G>A``; an optional
            accession prefix before ``:`` is dropped.
        ref_start: offset added to every position; must be > 0 for ``c.``
            labels and <= 0 for ``g.`` labels.

    Returns:
        (sorted list of int positions, rule string with coordinates
        stripped, e.g. "A>G"), or "" when *label* is None (kept for
        backward compatibility).

    Raises:
        ValueError: when the label prefix does not match ``ref_start``.
    """
    if label is None:
        return ""

    # Drop an accession prefix such as "NM_000546.6:".
    if ":" in label:
        label = label.split(":")[-1]

    if ref_start <= 0 and not label.startswith("g."):
        raise ValueError(f"{label} not genomic label")
    elif ref_start > 0 and not label.startswith("c."):
        raise ValueError(f"{label} not cdna label")

    # Remove the coordinate-system prefix and any "[n]" multipliers.
    label = re.sub(r"([cg]\.|\[\d+\])", "", label)

    sites = []

    for part in label.split("_"):
        if not part:
            continue

        # Keep only digits and intron-offset signs for this coordinate.
        part = re.sub(r"[^\d\+-]", "", part)
        if "+" in part:
            nums = [int(y) for y in part.split("+") if y]
            pos = nums[0] + nums[-1] if len(nums) > 1 else nums[0]
        elif "-" in part:
            nums = [int(y) for y in part.split("-") if y]
            # BUGFIX: HGVS "n-m" means m bases BEFORE position n, so the
            # offset must be subtracted (the original added it, decoding
            # c.76-1 to 77 instead of 75).  A lone leading "-" (5'UTR
            # coordinate) now yields a negative position instead of
            # crashing on int("").
            pos = nums[0] - nums[-1] if len(nums) > 1 else -nums[0]
        else:
            pos = int(part)

        sites.append(pos + ref_start)

    sites = sorted(sites)

    # Whatever is left after stripping numbers/offsets is the edit rule.
    rule = re.sub(r"[\d_\+-]", "", label)
    return sites, rule.strip()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # No CLI: this module is imported by design/main.py.
    pass
|
||||
|
||||
263
design/src/snv-N-2N-3N.py
Normal file
263
design/src/snv-N-2N-3N.py
Normal file
@@ -0,0 +1,263 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import itertools
|
||||
import random
|
||||
import gzip
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
|
||||
# Design overview: split the target sequence into 3 bp codons and, for each
# codon, systematically generate three mutation classes (1N/2N/3N base
# substitutions).  Every mutation gets 100 bp of up/downstream flanking
# sequence attached, and 150 mutations are randomly sampled per class.

# Mutation design constants
NUCLEOTIDES = ["A", "T", "C", "G"]  # DNA alphabet used for substitutions
UPSTREAM_LEN = 100      # bp of upstream flank attached to each edit
DOWNSTREAM_LEN = 100    # bp of downstream flank attached to each edit
TARGET_MUTATIONS = 150  # mutations sampled per strategy (1N/2N/3N)
|
||||
|
||||
|
||||
class Region:
    """A single 3 bp codon window within the full target sequence.

    Stores the codon's coordinates inside the gene sequence plus its
    absolute offset, which downstream code uses to slice flanks.
    """

    def __init__(
        self, chrom: str, start: int, end: int, sequence: str, absolute_index: int
    ):
        self.chrom = chrom
        self.start = start
        self.end = end
        # Normalise once so downstream string comparisons are uniform.
        self.sequence = sequence.upper()
        self.absolute_index = absolute_index
|
||||
|
||||
|
||||
def read_fasta(fasta_path: str) -> Dict[str, str]:
    """Parse a (optionally gzipped) FASTA file into ``{record id: sequence}``.

    Sequences are upper-cased with U transcribed to T; the record id is the
    first whitespace-delimited token of the header line.  Returns an empty
    dict when the file does not exist.
    """
    records: Dict[str, str] = {}
    header: Optional[str] = None
    open_fn = gzip.open if fasta_path.endswith(".gz") else open

    if not os.path.exists(fasta_path):
        logger.error(f"FASTA file not found: {fasta_path}")
        return {}

    chunks: List[str] = []

    def _flush() -> None:
        # Store the record accumulated so far, normalised to DNA upper case.
        if header and chunks:
            records[header] = "".join(chunks).upper().replace("U", "T")

    with open_fn(fasta_path, "rt") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            if raw.startswith(">"):
                _flush()
                header = raw[1:].split()[0]
                chunks = []
            elif header:
                chunks.append(raw)

    # Final record has no trailing ">" line to trigger the flush above.
    _flush()

    return records
|
||||
|
||||
|
||||
def split_sequence_to_codons(full_seq: str, gene_name: str) -> List[Region]:
    """Chop *full_seq* into complete 3 bp codon Regions.

    A trailing fragment of 1-2 bp is dropped; each Region records its
    absolute offset within *full_seq*.
    """
    # Only offsets with a full codon left (offset <= len - 3) are kept.
    return [
        Region(gene_name, offset, offset + 2, full_seq[offset : offset + 3], offset)
        for offset in range(0, len(full_seq) - 2, 3)
    ]
|
||||
|
||||
|
||||
def analyze_variant(ref: str, alt: str) -> str:
    """Render a substitution compactly with shared context outside the edit.

    Same-length inputs with one contiguous run of differences collapse to a
    single group ('CAT'->'CGC' gives 'C(AT/GC)'); scattered differences get
    one group per position ('CAT'->'GAG' gives '(C/G)A(T/G)').  Inputs of
    different length fall back to '(ref/alt)'.
    """
    if len(ref) != len(alt):
        return f"({ref}/{alt})"

    changed = [i for i, (r, a) in enumerate(zip(ref, alt)) if r != a]

    if not changed:
        return ref

    # One contiguous run of differences -> a single (run/run) group.
    if changed[-1] - changed[0] == len(changed) - 1:
        lo, hi = changed[0], changed[-1] + 1
        return f"{ref[:lo]}({ref[lo:hi]}/{alt[lo:hi]}){ref[hi:]}"

    # Scattered differences -> one (ref/alt) group per edited position.
    pieces = []
    cursor = 0
    for pos in changed:
        pieces.append(ref[cursor:pos])
        pieces.append(f"({ref[pos]}/{alt[pos]})")
        cursor = pos + 1
    pieces.append(ref[cursor:])

    return "".join(pieces)
|
||||
|
||||
|
||||
def generate_codon_mutations(original_codon: str, n_mutations: int) -> List[str]:
    """Return every codon differing from *original_codon* at exactly
    *n_mutations* positions, sorted alphabetically."""
    k = len(original_codon)
    variants = set()

    # Choose which positions mutate, then enumerate the alternative bases
    # at those positions while pinning the rest.
    for hit_positions in itertools.combinations(range(k), n_mutations):
        hits = set(hit_positions)
        pools: List[List[str]] = [
            [b for b in NUCLEOTIDES if b != base] if i in hits else [base]
            for i, base in enumerate(original_codon)
        ]
        for picked in itertools.product(*pools):
            candidate = "".join(picked)
            if candidate != original_codon:
                variants.add(candidate)

    return sorted(variants)
|
||||
|
||||
|
||||
def generate_editseq_and_metadata(
    full_seq: str, regions: List[Region], gene_name: str
) -> pd.DataFrame:
    """Build the complete 1N/2N/3N mutation table with flanked edit sequences."""
    rows: List[Dict[str, str]] = []

    for codon_index, region in enumerate(regions):
        codon_start = region.absolute_index
        ref_codon = region.sequence

        # Flanking context, clipped at the sequence boundaries.
        upstream = full_seq[max(0, codon_start - UPSTREAM_LEN) : codon_start]
        downstream = full_seq[
            codon_start + 3 : min(len(full_seq), codon_start + 3 + DOWNSTREAM_LEN)
        ]

        for strategy, n_mut in (("3N", 3), ("2N", 2), ("1N", 1)):
            for alt_codon in generate_codon_mutations(ref_codon, n_mut):
                # Compact (ref/alt) notation shared with the rest of the pipeline.
                simplified_codon = analyze_variant(ref_codon, alt_codon)

                rows.append(
                    {
                        # GENE_SUB_STRATEGY_AAINDEX_ORIGINAL>MUTATION
                        "sequence_name": f"{gene_name}_SUB_{strategy}_AA{codon_index + 1}_{ref_codon}>{alt_codon}",
                        # editseq: upstream flank + edit notation + downstream flank
                        "editseq": f"{upstream}{simplified_codon}{downstream}",
                        "strategy": strategy,
                        "mutation_type": "REPL",  # Replacement
                    }
                )

    return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def run_mutation_design(fasta_file: str, gene_name: str, output_base_name: str):
    """Execute the mutation-design pipeline and save one CSV per strategy.

    Parameters:
        fasta_file: path to the FASTA containing the target transcript.
        gene_name: substring matched (case-insensitively) against record IDs.
        output_base_name: file-name template containing the literal
            "{strategy}" placeholder (replaced by 1N/2N/3N).
    """

    logger.info(f"Targeting gene: {gene_name}")
    fasta_data = read_fasta(fasta_file)
    full_seq, target_id = "", ""

    # Locate target sequence by substring match on the record ID.
    for seq_id, seq in fasta_data.items():
        if gene_name.upper() in seq_id.upper():
            full_seq = seq
            target_id = seq_id
            break

    if not full_seq and fasta_data:
        # Fallback: use longest sequence
        target_id, full_seq = max(fasta_data.items(), key=lambda item: len(item[1]))
        if full_seq:
            logger.warning(
                f"Using longest sequence ID: {target_id} (Length: {len(full_seq)} bp)"
            )

    if not full_seq:
        logger.error(f"Failed to extract target sequence.")
        return

    logger.info(f"Target sequence ID: {target_id}, Length: {len(full_seq)} bp")

    # 1. Generate ALL mutations (1N, 2N, 3N)
    cds_regions = split_sequence_to_codons(full_seq, gene_name)
    all_mutations_df = generate_editseq_and_metadata(full_seq, cds_regions, gene_name)

    # 2. Process and save one file per strategy.
    strategies = ["1N", "2N", "3N"]

    for strategy in strategies:
        # Filter for the current strategy
        strategy_df = all_mutations_df[all_mutations_df["strategy"] == strategy].copy()
        original_count = len(strategy_df)

        # Determine output file name (e.g., AAVS1_1N_150_mutations.csv)
        output_file_name = output_base_name.replace("{strategy}", strategy)

        if original_count == 0:
            logger.warning(
                f"Strategy {strategy}: No mutations generated. Skipping file creation for {output_file_name}."
            )
            continue

        # Deterministic random sampling (fixed random_state) down to the
        # target count; smaller sets are saved in full.
        if original_count > TARGET_MUTATIONS:
            final_df = strategy_df.sample(n=TARGET_MUTATIONS, random_state=42)
            logger.success(
                f"Strategy {strategy}: Sampled {TARGET_MUTATIONS} mutations from {original_count} designs."
            )
        else:
            final_df = strategy_df
            logger.warning(
                f"Strategy {strategy}: Generated {original_count} mutations; saving all."
            )

        # Save result, ensuring column order
        final_df[["sequence_name", "editseq", "strategy", "mutation_type"]].to_csv(
            output_file_name, index=False
        )
        logger.success(f"Strategy {strategy}: Design saved to {output_file_name}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hard-coded run configuration for the AAVS1 design.
    AAVS1_FASTA_PATH = (
        "/rawdata1/project/peRNA_design/ref/AAVS1/ncbi_dataset/data/rna.fna"
    )
    GENE_NAME = "AAVS1"
    # "{strategy}" is substituted with 1N/2N/3N inside run_mutation_design.
    OUTPUT_BASE_NAME = "AAVS1_{strategy}_150_mutations.csv"

    run_mutation_design(
        fasta_file=AAVS1_FASTA_PATH,
        gene_name=GENE_NAME,
        output_base_name=OUTPUT_BASE_NAME,
    )
|
||||
Reference in New Issue
Block a user