提交饱和编辑的相关设计,及检验代码

This commit is contained in:
2026-02-26 14:02:42 +08:00
commit cb556b47c0
36 changed files with 5437 additions and 0 deletions

450
design/main.py Normal file
View File

@@ -0,0 +1,450 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import math
import os
import re
import sys
from glob import glob
import requests as rq
import pandas as pd
from loguru import logger
import numpy as np
from src.mutation import design_mutations_for_orf
from src.reader import (extract_orf_sequence, get_cds_for_gene,
load_uniprot_region, read_gtf, Region)
from src.liftover import convert_interval
from src.snp import decode_snp, generate_sequences_with_combinations
import itertools
from src.editseq import run_analysis
# Remove loguru's default handler so the sink configuration below is the only one.
logger.remove()
# Add a single sink (stderr) that only emits INFO-and-above records.
# logger.add(level="INFO")
logger.add(
    sys.stderr,
    colorize=True,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
    # <cyan>{name}</cyan>: <cyan>{function}</cyan>: <cyan>{line}</cyan>
    level="INFO"
)
def split_regions(cds):
    u"""Split sorted CDS intervals into 3 bp amino-acid (codon) regions.

    Intervals that do not end on a codon boundary emit a short "end" fragment,
    and the missing bases are carried into the next CDS interval as a short
    "start" fragment so codon phase is preserved across exon junctions.
    Each emitted Region keeps its parent CDS interval in ``.addition`` and
    tags its border situation in ``.kind`` ("regular", "start", "end").

    Test-case exons:
    14:103698801-103699017
    14:103699133-103699179
    14:103699364-103699576
    14:103703173-103703327
    14:103707003-103707215
    14:103708522-103708659
    14:103711033-103711087
    """
    regions = []
    cds = sorted(cds, key=lambda x: (x.chrom, x.start, x.end))
    aa_codon_len = 3
    # ``start`` doubles as state: 0 = no carry-over, negative = -(bases still
    # needed to finish the previous exon's last codon), positive = genomic cursor.
    start = 0
    for x in cds:
        # start == 0: begin cleanly at the current interval
        if start == 0:
            start = x.start
        elif start < 0:
            # start < 0: the previous CDS did not divide evenly into codons,
            # so emit the small carried-over piece at this exon's start first.
            # NOTE(review): the carry value (set below as ``start - x.end + 1``)
            # is only negative when the previous leftover was 2 bp; a 1 bp
            # leftover yields 0 and is treated as a fresh start, which appears
            # to drop that cross-exon codon — confirm whether this is intended.
            regions.append(Region(x.chrom, x.start, x.start - start, kind="start"))
            regions[-1].addition = x
            start = x.start - start
        while start + aa_codon_len <= x.end:
            # Record whether this codon touches an exon border, and which one.
            code = "regular"
            if start == x.start:
                code = "start"
            elif start + aa_codon_len == x.end:
                code = "end"
            regions.append(Region(x.chrom, start, start + aa_codon_len, kind=code))
            regions[-1].addition = x
            start += aa_codon_len
        if start < x.end:
            # Trailing sub-codon piece: record it as crossing the end border.
            regions.append(Region(x.chrom, start, x.end, kind="end"))
            regions[-1].addition = x
            start = start - x.end + 1
        else:
            # Exon ended exactly on a codon boundary: reset the cursor.
            start = 0
    return regions
def download_uniprot_region(protein, output):
    """Download the genomic/protein coordinate mapping for a UniProt accession.

    Fetches the EBI Proteins API "coordinates" payload for *protein*, caches
    the raw JSON next to *output* (same basename, ``.json`` suffix), and
    writes a TSV: a ``#<url>`` header line, then one exon per row with the
    ``chrom:begin-end`` genomic span followed by the protein begin/end
    positions (tab-separated with a literal ``-`` between them).  Only the
    first gnCoordinate entry on a primary human chromosome is exported.

    :param protein: UniProt accession, e.g. ``P21359``.
    :param output: destination TSV path (must end with ``.tsv``).
    :raises ValueError: when the accession does not resolve to a human entry.
    """
    cache_path = output.replace(".tsv", ".json")
    url = f"https://www.ebi.ac.uk/proteins/api/coordinates?accession={protein}"
    if os.path.exists(cache_path):
        # Reuse the cached API response when present to avoid re-downloading.
        with open(cache_path, "r") as r:
            payload = json.load(r)
    else:
        resp = rq.get(url, headers={"Accept": "application/json"})
        payload = resp.json()
        with open(cache_path, "w+") as w:
            json.dump(payload, w, indent=4)
    if not payload[0]["name"].endswith("HUMAN"):
        raise ValueError("protein is not human")
    # Accept only primary-assembly chromosomes (with or without "chr" prefix).
    valid_chroms = [str(x) for x in range(1, 23)] + ["chr" + str(x) for x in range(1, 23)] + ["X", "Y", "chrX", "chrY"]
    with open(output, "w+") as w:
        w.write(f"#{url}\n")
        for coord in payload[0]["gnCoordinate"]:
            chromosome = coord["genomicLocation"]["chromosome"]
            if chromosome not in valid_chroms:
                continue
            for exon in coord["genomicLocation"]["exon"]:
                genome = exon["genomeLocation"]
                genome = str(genome["begin"]["position"]) + "-" + str(genome["end"]["position"])
                prot_loc = exon["proteinLocation"]
                # Single-residue exons use "position" instead of begin/end.
                if "end" not in prot_loc and "position" in prot_loc:
                    prot_cols = [str(prot_loc["position"]["position"]), "-", str(prot_loc["position"]["position"])]
                else:
                    prot_cols = [str(prot_loc["begin"]["position"]), "-", str(prot_loc["end"]["position"])]
                # The join is hoisted out of the f-string: a backslash inside
                # an f-string expression is a SyntaxError before Python 3.12.
                prot_field = "\t".join(prot_cols)
                w.write(f"{chromosome}:{genome}\t{prot_field}\n")
            # Only the first matching chromosome mapping is exported.
            break
def get_aa_coords(genes, output):
    """Resolve each gene's reviewed human UniProt accession and download its
    genomic/protein coordinate TSV into *output*.

    :param genes: Excel file whose second column holds gene symbols
                  (assumed from the positional access — TODO confirm schema).
    :param output: directory receiving per-gene ``<gene>_<accession>.tsv`` files.
    """
    os.makedirs(output, exist_ok=True)
    df = pd.read_excel(genes)
    # df = df.loc[df["Batch"] == 1, :]
    for _, row in df.iterrows():
        # Use .iloc for positional access: plain row[1] relies on deprecated
        # positional Series indexing and breaks if a column is labelled 1.
        gene_name = row.iloc[1]
        url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json"
        resp = rq.get(url)
        for hit in resp.json().get("results", []):
            # Keep only the reviewed human entry; take the first match.
            if "HUMAN" in hit["uniProtkbId"]:
                accession = hit["primaryAccession"]
                download_uniprot_region(accession, os.path.join(output, f"{gene_name}_{accession}.tsv"))
                break
def adjust_cross_border_region(row):
    """Render a region as ``chrom:start-end``, widening sub-codon
    border-crossing regions to a full 3 bp codon.

    A region shorter than 3 bp whose ``kind`` contains "cross" is extended
    into the neighbouring intron: ``cross_start`` grows leftwards (end - 3),
    any other crossing kind grows rightwards (start + 3).  Everything else is
    rendered unchanged via ``str(row)``.
    """
    if "cross" in row.kind and len(row) < 3:
        if row.kind == "cross_start":
            return f"{row.chrom}:{row.end - 3}-{row.end}"
        return f"{row.chrom}:{row.start}-{row.start + 3}"
    return str(row)
def design_by_aa(genes, fasta, output, stop_codon = False):
    u"""Design mismatch sites and mismatch rules per amino acid (codon).

    For every ``*.tsv`` coordinate file under *genes*: load its CDS regions,
    split them into 3 bp codon regions, extract their sequences from *fasta*,
    enumerate candidate codon variants, and filter variants that would alter
    the intron context at exon borders.  Results are written to *output* CSV.

    :param genes: directory of per-gene UniProt coordinate TSV files.
    :param fasta: reference genome FASTA.
    :param output: destination CSV path.
    :param stop_codon: when True, keep only variants creating a stop codon.
    """
    df = []
    for gene in glob(os.path.join(genes, "*.tsv")):
        logger.info(f"开始设计突变 {gene}...")
        key = os.path.basename(gene).split(".")[0]
        # Load the recorded genomic regions for this gene
        cds = load_uniprot_region(gene)
        # Split them into per-amino-acid (3 bp) regions
        cds = split_regions(cds)
        if not cds:
            continue
        # Extract the nucleotide sequence of every region
        cds = extract_orf_sequence(fasta, cds, half_open=True)
        for idx, x in enumerate(cds):
            for strategy in ["3N"]:
                results = design_mutations_for_orf(x.sequence, strategy=strategy)
                for res in results:
                    for var in res["variants"]:
                        if var == res["original_codon"]:
                            continue
                        # 1 bp at the start means 2 bp lie in the previous exon's end;
                        # the variant's first 2 bp must equal the recorded intron bases.
                        if "cross_start" == x.kind and len(x) == 1 and var[:2] != x.sequence[:2]:
                            continue
                        # 2 bp at the start means 1 bp lies before; the first base must match.
                        elif "cross_start" == x.kind and len(x) == 2 and var[0] != x.sequence[0]:
                            continue
                        # 1 bp at the end means 2 bp lie after; the last 2 bp must match.
                        elif "cross_end" == x.kind and len(x) == 1 and var[1:] != x.sequence[1:]:
                            continue
                        # 2 bp at the end means 1 bp lies after; the last base must be C or G.
                        elif "cross_end" == x.kind and len(x) == 2 and var[-1] not in ["C", "G"]:
                            continue
                        row = [key, str(x.addition), idx+1, str(x), adjust_cross_border_region(x), x.kind, strategy, res["original_codon"], var]
                        df.append(row)
    df = pd.DataFrame(df)
    # NOTE(review): the "origial_code" typo is preserved deliberately — this
    # very function reads the column back below, and output files may be
    # consumed elsewhere under that name.
    df.columns = ["gene", "cds_region", "aa_index", "aa_region", "region_with_intron", "cross_cds_border", "strategy",
                  "origial_code", "mutation_code"]
    # Recompute the strategy label from the actual number of mismatched bases.
    strategy = []
    for _, row in df.iterrows():
        match = np.sum([x == y for x, y in zip(row["origial_code"], row["mutation_code"])])
        strategy.append(f"{3-match}N")
    df["strategy"] = strategy
    if stop_codon:
        df = df[df["mutation_code"].isin(["TAA", "TAG", "TGA"])]
    df.to_csv(output, index = False)
def design_by_snp(snp_info, targets, genes, fasta, fasta_hg38, output):
    """Generate mutation records for curated SNPs.

    Builds a per-gene cDNA-code -> genomic-code mapping from the curated
    workbook (*snp_info*), then walks the target workbook (*targets*,
    sheet index 2) and, for each recorded code, resolves the affected genomic
    region, decodes the edit rule, and writes one CSV row per concrete
    original/replacement sequence combination.

    :param snp_info: Excel workbook of curated SNP info, one sheet per gene.
    :param targets: Excel workbook listing the genes/codes to process.
    :param genes: directory of per-gene UniProt coordinate TSVs (chrom / CDS start).
    :param fasta: hg19 reference FASTA (used when a genomic code exists).
    :param fasta_hg38: hg38 reference FASTA (used for bare ``c.`` codes).
    :param output: destination CSV path.
    """
    logger.info("读取染色体")
    # Chromosome and CDS start per gene, from the first (sorted) CDS interval.
    chroms = {}
    starts = {}
    for gene in glob(os.path.join(genes, "*.tsv")):
        key = os.path.basename(gene).split(".")[0]
        cds = load_uniprot_region(gene)
        cds = sorted(cds, key=lambda x:[x.chrom, x.start, x.end])
        chroms[key] = cds[0].chrom
        starts[key] = cds[0].start
    logger.info(f"读取snp的信息{snp_info}")
    all_sheets = pd.read_excel(snp_info, sheet_name=None)
    # Walk every worksheet; build {gene token -> {cDNA code -> genomic code}}.
    res = {}
    for sheet_name, df in all_sheets.items():
        temp = {}
        for _, row in df.iterrows():
            # The column headers carry trailing spaces in the source workbook
            # and must be matched verbatim.
            cdna = row["DNA change (cDNA) "]
            hg38 = row["DNA change (genomic) (hg19)     "]
            temp[cdna] = hg38
        # A sheet named like "FANCA (xxx)" registers the map under each token.
        for sheet in re.split(r"[\(\s\)]", sheet_name):
            res[sheet] = temp
    print(res.keys())
    logger.info(f"读取目标:{targets}")
    df = pd.read_excel(targets, sheet_name=2)
    with open(output, "w+") as w:
        w.write(",".join(["gene", "cdna code", "genomic code", "mutation_region", "version", "original_codon", "mutation_code"]) + "\n")
        for column in df.columns:
            if "Unnamed" in column:
                continue
            for code in df[column]:
                # Skip empty (NaN) cells.
                # NOTE(review): math.isnan raises TypeError for non-float,
                # non-str cells (e.g. None) — assumed absent in these sheets.
                if not isinstance(code, str) and math.isnan(code):
                    continue
                genomic_code = res.get(column, {}).get(code)
                if genomic_code:
                    sites, rule = decode_snp(genomic_code)
                elif str(code).startswith("c."):
                    # "FAND2" is a known column-name typo for FANCD2.
                    sites, rule = decode_snp(code, ref_start=starts["FANCD2" if column == "FAND2" else column])
                else:
                    continue
                region = Region(chroms["FANCD2" if column == "FAND2" else column], start=sites[0], end=sites[-1])
                hg38 = False
                # Genomic codes are hg19-based; bare c. codes are resolved on hg38.
                if genomic_code:
                    region = extract_orf_sequence(fasta, [region])[0]
                elif str(code).startswith("c."):
                    hg38 = True
                    region = extract_orf_sequence(fasta_hg38, [region])[0]
                # Translate the HGVS-like rule into original/replacement strings.
                original, replacement = "", ""
                if ">" in rule:
                    original, replacement = rule.split(">")
                    # NOTE(review): the reference base parsed from the rule is
                    # overwritten by the extracted sequence — verify intended.
                    original = region.sequence
                elif rule == "dup":
                    original = region.sequence
                    replacement = original * 2
                elif rule == "del":
                    original = region.sequence
                    replacement = ""
                elif rule == "ins":
                    replacement = region.sequence
                elif "delins" in rule:
                    original = region.sequence
                    replacement = rule.replace("delins", "")
                elif "ins" in rule:
                    original = region.sequence
                    replacement = rule.replace("ins", "")
                if not genomic_code:
                    genomic_code = ""
                # Expand every ambiguous N into all concrete base combinations.
                for o, r in itertools.product(generate_sequences_with_combinations(original), generate_sequences_with_combinations(replacement)):
                    w.write(",".join([column, code.strip(), str(genomic_code).strip(), str(region), "hg38" if hg38 else "hg19", o, r]) + "\n")
    # data = pd.DataFrame(data)
    # data.columns = ["gene", "cdna code", "genomic code", "mutation_region", "original_codon", "mutation_code"]
    # data.to_csv(output, index = False)
def extract_fastq_seq(fastq: str, chrom, start, end):
    """Fetch the reference sequence for ``chrom:start-end`` from an indexed
    FASTA file.  (Despite the parameter name, *fastq* is a FASTA path.)
    """
    import pysam
    handle = pysam.FastaFile(fastq)
    try:
        return handle.fetch(str(chrom), start, end)
    finally:
        handle.close()
def decode_mutation(rule: str, sequence):
    """Translate an HGVS-like edit *rule* plus the reference *sequence* into
    an ``(original, replacement)`` pair of strings.

    Supported rules: ``X>Y`` substitutions, ``dup``, ``del``, bare ``ins``,
    ``delins<SEQ>`` and ``ins<SEQ>``.  Unrecognised rules yield ``("", "")``.
    """
    if ">" in rule:
        # Substitution: the extracted reference sequence becomes "original",
        # the right-hand side of the rule becomes "replacement".
        _, replacement = rule.split(">")
        return sequence, replacement
    if rule == "dup":
        return sequence, sequence * 2
    if rule == "del":
        return sequence, ""
    if rule == "ins":
        return "", sequence
    # "delins" must be tested before the generic "ins" substring match.
    if "delins" in rule:
        return sequence, rule.replace("delins", "")
    if "ins" in rule:
        return sequence, rule.replace("ins", "")
    return "", ""
def design_by_hmgd(data, fasta, outfile):
    """Build editseq-style records from an HGMD-like mutation table.

    Expects the CSV at *data* to provide ``gene``, ``hgvs``, ``chromosome``,
    ``startCoord`` and ``endCoord`` columns.  For every row the reference
    sequence is fetched from *fasta*, the HGVS rule is decoded, and the
    variant is rendered as ``<100bp up>(ref/alt)<100bp down>``.  Results go
    to *outfile* as a CSV with columns ``sequence_name`` and ``editseq``.
    Malformed rows are skipped best-effort (with a warning) rather than
    aborting the whole run.
    """
    res = pd.read_csv(data)
    rows = []
    for idx, row in res.iterrows():
        key = row["gene"] + "_" + str(idx)
        try:
            # presumably startCoord/endCoord are 1-based inclusive, hence the
            # -1 for pysam's 0-based fetch — TODO confirm against allmut.csv.
            seq = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1, row["endCoord"])
            seq, replace = decode_mutation(row["hgvs"], seq)
            if not seq:
                continue
            # Strip positional digits/underscores left over from the HGVS rule
            # (top-level ``re`` import is used; the redundant local import is gone).
            replace = re.sub(r"[\d_]", "", replace)
            if "del" in replace:
                replace = ""
            print(key, seq, replace)
            # 100 bp of flanking context on each side of the edit.
            before = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1 - 100, row["startCoord"])
            after = extract_fastq_seq(fasta, int(row["chromosome"]), row["endCoord"], row["endCoord"] + 100)
            rows.append({"sequence_name": key, "editseq": f"{before}({seq}/{replace}){after}"})
        except Exception as err:
            # Best-effort: skip the bad row, but leave a trace instead of
            # swallowing the failure silently.
            logger.warning(f"skip {key}: {err}")
            continue
    pd.DataFrame(rows).to_csv(outfile, index=False)
if __name__ == "__main__":
    # CLI entry point; historical invocations are kept below for reference.
    # NOTE(review): ``Fire`` is imported but every call is commented out, so
    # running this module currently does nothing.
    from fire import Fire
    # get_aa_coords(
    #     "../metainfo/Cancer and blood disorder panels_v2.xlsx",
    #     "../gene_coords/batch2"
    # )
    # get_aa_coords(
    #     "../metainfo/DDR gene library in 2021 Cell.xlsx",
    #     "../gene_coords/positive"
    # )
    # # Fire({"aa": design_by_aa})
    # design_by_aa(
    #     "../gene_coords/batch2",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_batch2.csv.gz"
    # )
    # design_by_aa(
    #     "../gene_coords/positive",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_positive.csv.gz",
    #     stop_codon = True
    # )
    # run_analysis(
    #     "../gene_aa_target_batch2.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/batch2"
    # )
    # run_analysis(
    #     "../gene_aa_target_positive.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/positive"
    # )
    # Generate SNP targets; snp_info is the curated SNP workbook,
    # targets records the genes that need processing.
    # design_by_snp(
    #     snp_info="../metainfo/副本FA家族基因-20250829-DJJ_XD.xlsx",
    #     targets="../metainfo/实验计划.xlsx",
    #     output="gene_snp_target.csv",
    #     fasta="../ref/gencode/GRCh37.p13.genome.fa.gz",
    #     fasta_hg38="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     genes="../gene_coords"
    # )
    # design_by_hmgd(
    #     "../metainfo/allmut.csv",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outfile="../../prediction/input/pos_v2.csv.gz"
    # )
    # url = "https://www.ebi.ac.uk/proteins/api/coordinates?accession=P21359-1"
    # download_uniprot_region("Test", "P21359")