提交饱和编辑的相关设计,及检验代码

This commit is contained in:
2026-02-26 14:02:42 +08:00
commit cb556b47c0
36 changed files with 5437 additions and 0 deletions

301
.gitignore vendored Normal file
View File

@@ -0,0 +1,301 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
.cache
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
# NOTE: stray GitHub page text ("Footer", "© 2022 GitHub, Inc.",
# "Footer navigation", "Terms") was pasted into this file; removed because
# those lines would otherwise ignore files literally named "Footer"/"Terms".
.idea/
*.png
*.pdf
Mus_musculus.GRCm38.101.gtf.gz*
hg38.chr19.gtf*
Homo_sapiens.GRCh38.101.sorted.gtf.gz*
SRX8994511.corrected_reads.bed.gz*
SRX8994511_sample.bed.gz*
tmp
new_run.sh
example.sorted.sorted.gtf
example.sorted.sorted.gtf.gz
example.sorted.sorted.gtf.gz.tbi
.vscode/
docs/_*
plots/
conda_build.py
run.sh
param.py
.DS_Store
ui/
*.rds
*.zip
example/
recipes/
AppDir/
appimage-build/
*_issue
.ruff_cache
*.csv

89
better_input_seq.py Normal file
View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import os
import gzip
from glob import glob
from tqdm import tqdm
import pandas as pd
def load_left_aa(ref):
    """Read a CSV with ``gene`` and ``aa_pos`` columns and return the set of
    "<gene>_AA<pos>" identifiers it describes."""
    table = pd.read_csv(ref)
    return {f"{gene}_AA{pos}" for gene, pos in zip(table["gene"], table["aa_pos"])}
def reader(path):
    """Stream the rows of a gzip-compressed CSV file as dicts, wrapped in a
    tqdm progress bar."""
    with gzip.open(path, "rt") as handle:
        yield from tqdm(csv.DictReader(handle))
def process_seq(sequence: str):
    """Shrink the edited-codon annotation in ``sequence`` to the minimal
    mismatching stretch.

    ``sequence`` looks like ``PREFIX(SRC/DST)SUFFIX`` where SRC and DST are
    3-bp codons.  Bases that SRC and DST share at the codon edges are moved
    out of the parentheses.  Returns None when the difference is not one
    contiguous stretch of 1-2 bases (i.e. 0 or 3 mismatches, or mismatches
    at both outer positions only).
    """
    prefix, rest = sequence.split("(")
    codon, suffix = rest.split(")")
    src, dst = codon.split("/")
    diffs = sum(a != b for a, b in zip(src, dst))
    if diffs == 1:
        if src[:2] == dst[:2]:
            # Only the last base changed.
            return f"{prefix}{src[:2]}({src[2]}/{dst[2]}){suffix}"
        if src[1:] == dst[1:]:
            # Only the first base changed.
            return f"{prefix}({src[0]}/{dst[0]}){src[1:]}{suffix}"
        # Only the middle base changed.
        return f"{prefix}{src[0]}({src[1]}/{dst[1]}){src[2]}{suffix}"
    if diffs == 2:
        if src[0] == dst[0]:
            return f"{prefix}{src[0]}({src[1:]}/{dst[1:]}){suffix}"
        if src[-1] == dst[-1]:
            return f"{prefix}({src[:2]}/{dst[:2]}){src[-1]}{suffix}"
    return None
def main(ref, infile, outfile):
    """Filter edit-sequence CSVs down to the targets listed in ``ref``.

    Args:
        ref: CSV with ``gene``/``aa_pos`` columns naming wanted targets.
        infile: glob pattern matching gzip-compressed input CSVs.
        outfile: gzip-compressed output CSV path.

    Raises:
        ValueError: when no input row matches a wanted target (previously
            this crashed with an opaque IndexError on ``data[0]``).
    """
    wanted = load_left_aa(ref)
    data = []
    for file in glob(infile):
        for row in reader(file):
            # sequence_name looks like "<gene>_AA<pos>_..."; keep gene + position.
            seq_name = "_".join(row["sequence_name"].split("_")[:2])
            if seq_name not in wanted:
                continue
            row["editseq"] = process_seq(row["editseq"])
            # Rows whose edit could not be reduced (process_seq -> None) are dropped.
            if row["editseq"]:
                row.pop("strategy")
                row.pop("mutation_type")
                data.append(row)
    if not data:
        raise ValueError(f"no matching records found in {infile!r}")
    with gzip.open(outfile, "wt+") as w:
        dict_writer = csv.DictWriter(w, fieldnames=data[0].keys())
        dict_writer.writeheader()
        dict_writer.writerows(data)
if __name__ == '__main__':
    # CLI entry point: ``python better_input_seq.py REF INFILE OUTFILE`` via python-fire.
    from fire import Fire
    Fire(main)

View File

@@ -0,0 +1,222 @@
import csv
import gzip
import logging
from typing import Set, Dict, Iterator
# 配置日志记录
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def read_excluded_sequences(file_path: str, sequence_column: str = 'sequence_name') -> Set[str]:
    """Collect the sequence names to exclude from a gzip-compressed CSV.

    Args:
        file_path: path to the gzipped CSV file.
        sequence_column: name of the column holding the sequence names.

    Returns:
        Set of stripped, non-empty sequence names.

    Raises:
        ValueError: when ``sequence_column`` is missing from the header.
        FileNotFoundError: when the file does not exist (logged, re-raised).
        Exception: any other read error (logged, re-raised).
    """
    try:
        with gzip.open(file_path, 'rt', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            # Fail early if the required column is absent.
            if sequence_column not in reader.fieldnames:
                raise ValueError(f"CSV文件中缺少'{sequence_column}'")
            excluded = {
                name.strip()
                for name in (row.get(sequence_column) for row in reader)
                if name
            }
            logger.info(f"从 {file_path} 读取了 {len(excluded)} 个排除序列")
            return excluded
    except FileNotFoundError:
        logger.error(f"文件不存在: {file_path}")
        raise
    except Exception as e:
        logger.error(f"读取文件 {file_path} 时出错: {e}")
        raise
def validate_csv_headers(file_path: str, expected_headers: Set[str], gzipped: bool = True) -> bool:
    """Check that a (possibly gzipped) CSV file contains the required columns.

    Args:
        file_path: path to the file.
        expected_headers: set of column names that must be present.
        gzipped: whether the file is gzip-compressed.

    Returns:
        True when every expected header is present in the first row;
        False otherwise.  All errors are caught and reported as failures.
    """
    try:
        opener, mode = (gzip.open, 'rt') if gzipped else (open, 'r')
        with opener(file_path, mode, newline='', encoding='utf-8') as f:
            # The first row is taken as the header.
            headers = next(csv.reader(f), None)
            if not headers:
                logger.error(f"文件 {file_path} 没有表头或为空")
                return False
            missing_headers = expected_headers - set(headers)
            if missing_headers:
                logger.error(f"文件 {file_path} 缺少必需列: {missing_headers}")
                return False
            logger.info(f"文件 {file_path} 表头验证通过")
            return True
    except Exception as e:
        logger.error(f"验证文件 {file_path} 表头时出错: {e}")
        return False
def process_prime_design(primedesign_path: str, excluded_sequences: Set[str],
                         output_path: str, batch_size: int = 10000) -> int:
    """Stream-filter a gzipped PrimeDesign CSV, keeping only pegRNA rows
    whose Target_name is not in the exclusion set.

    Args:
        primedesign_path: PrimeDesign output path (gzip-compressed CSV).
        excluded_sequences: target names to drop.
        output_path: gzip-compressed CSV to write.
        batch_size: flush the output every this many written rows.

    Returns:
        Number of rows written.  (The original docstring said "rows
        processed"; the code returns ``written_count``.)
    """
    processed_count = 0
    written_count = 0
    try:
        with gzip.open(primedesign_path, 'rt', newline='', encoding='utf-8') as input_file, \
                gzip.open(output_path, 'wt', newline='', encoding='utf-8') as output_file:
            # Reader/writer share the input file's header columns.
            reader = csv.DictReader(input_file)
            writer = csv.DictWriter(output_file, fieldnames=reader.fieldnames)
            writer.writeheader()
            # Row-by-row streaming keeps memory flat for arbitrarily large files.
            for row in reader:
                processed_count += 1
                try:
                    gRNA_type = row.get('gRNA_type', '').strip()
                    target_name = row.get('Target_name', '').strip()
                    # Keep only pegRNA rows that are not excluded.
                    if gRNA_type == "pegRNA" and target_name not in excluded_sequences:
                        writer.writerow(row)
                        written_count += 1
                        # Periodic flush so partial output survives interruption.
                        if written_count % batch_size == 0:
                            output_file.flush()
                except KeyError as e:
                    logger.warning(f"第 {processed_count} 行缺少字段 {e},跳过该行")
                    continue
                except Exception as e:
                    logger.warning(f"处理第 {processed_count} 行时出错: {e},跳过该行")
                    continue
            # Final flush before the context managers close the files.
            output_file.flush()
        logger.info(f"处理完成: 处理了 {processed_count} 行,写入了 {written_count} 行")
        return written_count
    except Exception as e:
        logger.error(f"处理PrimeDesign文件时出错: {e}")
        raise
def main(pegrna: str, primedesign: str, output: str) -> None:
    """Validate inputs, load the exclusion list, then filter the PrimeDesign
    output down to pegRNA records whose target is not excluded.

    Args:
        pegrna: gzipped CSV listing sequence names to exclude.
        primedesign: gzipped PrimeDesign output CSV.
        output: prefix for "<output>_PrimeDesign_pegRNA.csv.gz".

    Raises:
        ValueError: when either input file fails header validation.
    """
    logger.info("开始处理PrimeDesign文件")
    # Step 1: validate both input files before doing any work.
    logger.info("验证输入文件格式...")
    if not validate_csv_headers(pegrna, {'sequence_name'}, gzipped=True):
        raise ValueError("pegrna文件格式验证失败")
    if not validate_csv_headers(primedesign, {'gRNA_type', 'Target_name'}, gzipped=True):
        raise ValueError("primedesign文件格式验证失败")
    # Step 2: load the exclusion list.
    logger.info("读取需要排除的序列...")
    excluded_sequences = read_excluded_sequences(pegrna)
    # Step 3: stream-filter the PrimeDesign file.
    logger.info("开始处理PrimeDesign文件...")
    output_path = f"{output}_PrimeDesign_pegRNA.csv.gz"
    written_count = process_prime_design(
        primedesign_path=primedesign,
        excluded_sequences=excluded_sequences,
        output_path=output_path,
        batch_size=10000,
    )
    logger.info(f"输出文件已保存: {output_path}")
    logger.info(f"总共写入了 {written_count} 条pegRNA记录")
def safe_main(pegrna: str, primedesign: str, output: str) -> None:
    """Wrapper around :func:`main` that logs success or failure.

    Args:
        pegrna: gzipped CSV listing sequence names to exclude.
        primedesign: PrimeDesign output file (gzip-compressed).
        output: output file prefix.

    Raises:
        Exception: whatever :func:`main` raised, re-raised after logging.
    """
    try:
        main(pegrna, primedesign, output)
        logger.info("程序执行成功!")
    except Exception as e:
        logger.error(f"程序执行失败: {e}")
        raise
# Run directly as a script.
if __name__ == "__main__":
    import sys

    # Fail fast with a usage message instead of an IndexError when arguments
    # are missing (previously sys.argv was indexed unchecked).
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} <pegrna.csv.gz> <primedesign.csv.gz> <output_prefix>")
    pegrna_file = sys.argv[1]
    primedesign_file = sys.argv[2]
    output_prefix = sys.argv[3]
    safe_main(pegrna_file, primedesign_file, output_prefix)

450
design/main.py Normal file
View File

@@ -0,0 +1,450 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import math
import os
import re
import sys
from glob import glob
import requests as rq
import pandas as pd
from loguru import logger
import numpy as np
from src.mutation import design_mutations_for_orf
from src.reader import (extract_orf_sequence, get_cds_for_gene,
load_uniprot_region, read_gtf, Region)
from src.liftover import convert_interval
from src.snp import decode_snp, generate_sequences_with_combinations
import itertools
from src.editseq import run_analysis
# 清除默认的 handler
logger.remove()
# 添加一个只输出 INFO 及以上级别日志的 sink如控制台
# logger.add(level="INFO")
logger.add(
sys.stderr,
colorize=True,
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
# <cyan>{name}</cyan>: <cyan>{function}</cyan>: <cyan>{line}</cyan>
level="INFO"
)
def split_regions(cds):
    """Split CDS exons into per-amino-acid (3 bp) regions.

    Walks the exons in sorted order and emits one Region per codon.  A codon
    straddling an exon boundary is emitted as two partial Regions: the tail
    of the current exon (kind="end") and the head of the next (kind="start").
    Between iterations, a negative ``start`` carries "this many bases of the
    current codon still have to come from the next exon"; 0 means the codon
    boundary coincides with the exon boundary.  Coordinates are assumed
    half-open, matching the later extract_orf_sequence(..., half_open=True)
    call.

    Original author's test regions, e.g.:
        14:103698801-103699017
        14:103699133-103699179

    Returns:
        List of Region objects, each with its source exon in ``.addition``.
    """
    regions = []
    cds = sorted(cds, key=lambda x: (x.chrom, x.start, x.end))
    aa_codon_len = 3
    start = 0
    for x in cds:
        if start == 0:
            # Fresh codon boundary: begin at this exon's start.
            start = x.start
        elif start < 0:
            # The previous exon ended mid-codon; -start bases of that codon
            # spill into this exon — emit them as a separate "start" region.
            regions.append(Region(x.chrom, x.start, x.start - start, kind="start"))
            regions[-1].addition = x
            start = x.start - start
        while start + aa_codon_len <= x.end:
            # Record whether the codon touches an exon boundary.
            code = "regular"
            if start == x.start:
                code = "start"
            elif start + aa_codon_len == x.end:
                code = "end"
            regions.append(Region(x.chrom, start, start + aa_codon_len, kind=code))
            regions[-1].addition = x
            start += aa_codon_len
        if start < x.end:
            # 1 or 2 bases of the current codon remain in this exon.
            regions.append(Region(x.chrom, start, x.end, kind="end"))
            regions[-1].addition = x
            # Carry the (negative) number of bases still needed from the next
            # exon.  BUG FIX: the original computed ``start - x.end + 1``,
            # which evaluates to 0 (the "fresh codon" sentinel) when exactly
            # one base is left over, silently shifting the reading frame for
            # the rest of the gene.  A leftover of (x.end - start) bases
            # needs 3 - (x.end - start) more, so the carry is:
            start = (x.end - start) - aa_codon_len
        else:
            # Codon boundary coincides with the exon end — reset the carry.
            start = 0
    return regions
def download_uniprot_region(protein, output):
    """Download UniProt genomic coordinates for accession ``protein`` and
    write them to ``output`` as "chrom:begin-end<TAB>begin<TAB>-<TAB>end"
    rows (one per exon).

    The raw JSON response is cached next to ``output`` (same name with a
    .json suffix) and reused on later calls.

    Raises:
        ValueError: when the accession does not resolve to a human protein.
    """
    resp = output.replace(".tsv", ".json")
    url = f"https://www.ebi.ac.uk/proteins/api/coordinates?accession={protein}"
    if os.path.exists(resp):
        # Reuse the cached API response instead of hitting the network.
        with open(resp, "r") as r:
            resp = json.load(r)
    else:
        resp = rq.get(url, headers={"Accept": "application/json"})
        resp = resp.json()
        with open(output.replace(".tsv", ".json"), "w+") as w:
            json.dump(resp, w, indent=4)
    if not resp[0]["name"].endswith("HUMAN"):
        raise ValueError(f"protein is not human")
    # Accept only primary chromosomes, with or without a "chr" prefix.
    __chroms__ = [str(x) for x in range(1, 23)] + ["chr" + str(x) for x in range(1, 23)] + ["X", "Y", "chrX", "chrY"]
    with open(output, "w+") as w:
        # First line records the source URL as a comment.
        w.write(f"#{url}\n")
        for coord in resp[0]["gnCoordinate"]:
            chromosome = coord["genomicLocation"]["chromosome"]
            if chromosome not in __chroms__:
                continue
            for row in coord["genomicLocation"]["exon"]:
                genome = row["genomeLocation"]
                genome = str(genome["begin"]["position"]) + "-" + str(genome["end"]["position"])
                # ``protein`` is rebound here from the accession to the
                # per-exon protein location — intentional but easy to miss.
                protein = row["proteinLocation"]
                # Single-residue exons carry "position" instead of begin/end.
                if "end" not in protein and "position" in protein:
                    protein = [str(protein["position"]["position"]), "-", str(protein["position"]["position"])]
                else:
                    protein = [str(protein["begin"]["position"]), "-", str(protein["end"]["position"])]
                # Backslash inside an f-string expression needs Python >= 3.12 (PEP 701).
                row = f"{chromosome}:{genome}\t{'\t'.join(protein)}"
                w.write(row + "\n")
            # Only the first mapping on a primary chromosome is kept.
            break
def get_aa_coords(genes, output):
    """For each gene listed in the Excel sheet, look up its reviewed human
    UniProt accession and download the genomic coordinate table into
    ``output`` as "<gene>_<accession>.tsv"."""
    os.makedirs(output, exist_ok=True)
    table = pd.read_excel(genes)
    for _, entry in table.iterrows():
        # Gene symbol lives in the second column of the sheet.
        gene_name = entry[1]
        url = (
            "https://rest.uniprot.org/uniprotkb/search?query=gene:"
            f"{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json"
        )
        response = rq.get(url)
        for hit in response.json().get("results", []):
            if "HUMAN" not in hit["uniProtkbId"]:
                continue
            accession = hit["primaryAccession"]
            download_uniprot_region(accession, os.path.join(output, f"{gene_name}_{accession}.tsv"))
            # Only the first reviewed human hit is used.
            break
def adjust_cross_border_region(row):
    """Render ``row`` as "chrom:start-end", widening sub-codon regions that
    cross a CDS border so they span a full codon.

    For kind "cross_start" the region is extended leftwards (start = end - 3);
    for any other "cross" kind it is extended rightwards (end = start + 3).
    Regions of 3+ bp, or without a "cross" kind, are returned via str(row).
    """
    if "cross" in row.kind and len(row) < 3:
        if row.kind == "cross_start":
            return f"{row.chrom}:{row.end - 3}-{row.end}"
        return f"{row.chrom}:{row.start}-{row.start + 3}"
    return str(row)
def design_by_aa(genes, fasta, output, stop_codon = False):
    u"""Design per-amino-acid mismatch sites and mismatch rules.

    Args:
        genes: directory of per-gene UniProt coordinate TSVs.
        fasta: reference genome FASTA path.
        output: CSV path for the designed mutations.
        stop_codon: when True, keep only variants introducing a stop codon.
    """
    df = []
    for gene in glob(os.path.join(genes, "*.tsv")):
        logger.info(f"开始设计突变 {gene}...")
        key = os.path.basename(gene).split(".")[0]
        # Load the recorded exon regions for this gene.
        cds = load_uniprot_region(gene)
        # Partition them into per-amino-acid (3 bp) regions.
        cds = split_regions(cds)
        if not cds:
            continue
        # Attach the genomic sequence of each region.
        cds = extract_orf_sequence(fasta, cds, half_open=True)
        for idx, x in enumerate(cds):
            for strategy in ["3N"]:
                results = design_mutations_for_orf(x.sequence, strategy=strategy)
                for res in results:
                    for var in res["variants"]:
                        if var == res["original_codon"]:
                            continue
                        # NOTE(review): split_regions() assigns kinds
                        # "start"/"end"/"regular" while these filters test
                        # "cross_start"/"cross_end" — confirm where the kinds
                        # get renamed, otherwise these branches never fire.
                        # 1 bp at the start => 2 bp lie in the previous exon;
                        # the first two bases must match the recorded intron.
                        if "cross_start" == x.kind and len(x) == 1 and var[:2] != x.sequence[:2]:
                            continue
                        # 2 bp at the start => the first base must match the
                        # intronic base at the same position.
                        elif "cross_start" == x.kind and len(x) == 2 and var[0] != x.sequence[0]:
                            continue
                        # 1 bp at the end => the trailing 2 bp must match the intron.
                        elif "cross_end" == x.kind and len(x) == 1 and var[1:] != x.sequence[1:]:
                            continue
                        # 2 bp at the end => the last base must be C or G.
                        elif "cross_end" == x.kind and len(x) == 2 and var[-1] not in ["C", "G"]:
                            continue
                        row = [key, str(x.addition), idx+1, str(x), adjust_cross_border_region(x), x.kind, strategy, res["original_codon"], var]
                        df.append(row)
    df = pd.DataFrame(df)
    # NOTE(review): "origial_code" is a persisted (misspelled) column name
    # that downstream consumers read — kept as-is for compatibility.
    df.columns = ["gene", "cds_region", "aa_index", "aa_region", "region_with_intron", "cross_cds_border", "strategy",
                  "origial_code", "mutation_code"]
    # Re-derive the strategy column from the actual mismatch count.
    strategy = []
    for _, row in df.iterrows():
        match = np.sum([x == y for x, y in zip(row["origial_code"], row["mutation_code"])])
        strategy.append(f"{3-match}N")
    df["strategy"] = strategy
    if stop_codon:
        df = df[df["mutation_code"].isin(["TAA", "TAG", "TGA"])]
    df.to_csv(output, index = False)
def design_by_snp(snp_info, targets, genes, fasta, fasta_hg38, output):
    """Design mutations from curated SNP workbooks.

    Args:
        snp_info: Excel workbook mapping cDNA changes to genomic (hg19) changes.
        targets: Excel workbook listing the variants to process (sheet index 2).
        genes: directory of per-gene UniProt coordinate TSVs.
        fasta: hg19 reference FASTA.
        fasta_hg38: hg38 reference FASTA.
        output: CSV path, written row by row.
    """
    logger.info("读取染色体")
    chroms = {}
    starts = {}
    # Record each gene's chromosome and CDS start for c.-coordinate decoding.
    for gene in glob(os.path.join(genes, "*.tsv")):
        key = os.path.basename(gene).split(".")[0]
        cds = load_uniprot_region(gene)
        cds = sorted(cds, key=lambda x:[x.chrom, x.start, x.end])
        chroms[key] = cds[0].chrom
        starts[key] = cds[0].start
    logger.info(f"读取snp的信息{snp_info}")
    all_sheets = pd.read_excel(snp_info, sheet_name=None)
    # Build a cDNA -> genomic lookup per worksheet.
    res = {}
    for sheet_name, df in all_sheets.items():
        temp = {}
        for _, row in df.iterrows():
            # NOTE(review): the trailing spaces in these column names exist in
            # the source workbook — do not "fix" them.
            cdna = row["DNA change (cDNA) "]
            hg38 = row["DNA change (genomic) (hg19)     "]
            temp[cdna] = hg38
        # A sheet named e.g. "FANCA (FAA)" is registered under both tokens.
        for sheet in re.split(r"[\(\s\)]", sheet_name):
            res[sheet] = temp
    print(res.keys())
    logger.info(f"读取目标:{targets}")
    df = pd.read_excel(targets, sheet_name=2)
    with open(output, "w+") as w:
        w.write(",".join(["gene", "cdna code", "genomic code", "mutation_region", "version", "original_codon", "mutation_code"]) + "\n")
        for column in df.columns:
            if "Unnamed" in column:
                continue
            for code in df[column]:
                # Skip empty (NaN) cells.
                if not isinstance(code, str) and math.isnan(code):
                    continue
                genomic_code = res.get(column, {}).get(code)
                if genomic_code:
                    sites, rule = decode_snp(genomic_code)
                elif str(code).startswith("c."):
                    # "FAND2" appears to be a workbook typo for FANCD2 — TODO confirm.
                    sites, rule = decode_snp(code, ref_start=starts["FANCD2" if column == "FAND2" else column])
                else:
                    continue
                region = Region(chroms["FANCD2" if column == "FAND2" else column], start=sites[0], end=sites[-1])
                hg38 = False
                # Genomic codes are hg19; bare c.-codes are resolved on hg38.
                if genomic_code:
                    region = extract_orf_sequence(fasta, [region])[0]
                elif str(code).startswith("c."):
                    hg38 = True
                    region = extract_orf_sequence(fasta_hg38, [region])[0]
                original, replacement = "", ""
                # Decode the mutation rule into original/replacement bases.
                # NOTE(review): this duplicates decode_mutation() in this file,
                # except the bare-"ins" branch leaves ``original`` empty here.
                if ">" in rule:
                    original, replacement = rule.split(">")
                    original = region.sequence
                elif rule == "dup":
                    original = region.sequence
                    replacement = original * 2
                elif rule == "del":
                    original = region.sequence
                    replacement = ""
                elif rule == "ins":
                    replacement = region.sequence
                elif "delins" in rule:
                    original = region.sequence
                    replacement = rule.replace("delins", "")
                elif "ins" in rule:
                    original = region.sequence
                    replacement = rule.replace("ins", "")
                if not genomic_code:
                    genomic_code = ""
                # Expand every ambiguous N into all concrete base combinations.
                for o, r in itertools.product(generate_sequences_with_combinations(original), generate_sequences_with_combinations(replacement)):
                    w.write(",".join([column, code.strip(), str(genomic_code).strip(), str(region), "hg38" if hg38 else "hg19", o, r]) + "\n")
    # data = pd.DataFrame(data)
    # data.columns = ["gene", "cdna code", "genomic code", "mutation_region", "original_codon", "mutation_code"]
    # data.to_csv(output, index = False)
def extract_fastq_seq(fastq: str, chrom, start, end):
    """Fetch the bases of ``chrom:start-end`` (pysam's 0-based, half-open
    coordinates) from an indexed FASTA file."""
    import pysam
    with pysam.FastaFile(fastq) as handle:
        return handle.fetch(str(chrom), start, end)
def decode_mutation(rule: str, sequence):
    """Translate an HGVS-style mutation ``rule`` into an
    (original, replacement) pair of sequences.

    ``sequence`` is the reference bases covering the mutated region.
    Supported rules: "X>Y" substitutions, "dup", "del", bare "ins",
    "delinsXXX" and "insXXX".  Unrecognised rules yield ("", "").
    """
    if ">" in rule:
        _, replacement = rule.split(">")
        return sequence, replacement
    if rule == "dup":
        return sequence, sequence * 2
    if rule == "del":
        return sequence, ""
    if rule == "ins":
        # Bare "ins": the fetched sequence IS the inserted material.
        return "", sequence
    if "delins" in rule:
        return sequence, rule.replace("delins", "")
    if "ins" in rule:
        return sequence, rule.replace("ins", "")
    return "", ""
def design_by_hmgd(data, fasta, outfile):
    """Build EditSeq records from an HGMD-style variant table.

    Expects ``data`` (CSV) to provide hgvs, gene, chromosome, startCoord and
    endCoord columns; writes sequence_name/editseq rows to ``outfile``.
    """
    # Local import shadows the module-level ``re`` — harmless but redundant.
    import re
    res = pd.read_csv(data)
    # Relevant columns: hgvs, chromosome, startCoord, endCoord.
    data = []
    for idx, row in res.iterrows():
        key = row["gene"] + "_" + str(idx)
        try:
            seq = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1, row["endCoord"])
            seq, replace = decode_mutation(row["hgvs"], seq)
            if not seq:
                continue
            # Strip positional digits/underscores left over from the HGVS payload.
            replace = re.sub(r"[\d_]", "", replace)
            if "del" in replace:
                replace = ""
            print(key, seq, replace)
            # 100 bp of genomic context on each side of the edit.
            before = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1 - 100, row["startCoord"])
            after = extract_fastq_seq(fasta, int(row["chromosome"]), row["endCoord"], row["endCoord"] + 100)
            seq = f"{before}({seq}/{replace}){after}"
            data.append({"sequence_name": key, "editseq": seq})
        except Exception:
            # NOTE(review): any failure silently drops the row — consider
            # logging the exception so skipped variants are traceable.
            continue
    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
if __name__ == "__main__":
    from fire import Fire

    # Defect fixed: Fire was imported but never invoked (everything below it
    # was commented out), so running this script did nothing.  Expose the
    # design entry points as subcommands, e.g.:
    #   python main.py aa ../gene_coords/batch2 \
    #       --fasta ../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz \
    #       --output ../gene_aa_target_batch2.csv.gz
    Fire({
        "coords": get_aa_coords,
        "aa": design_by_aa,
        "snp": design_by_snp,
        "hmgd": design_by_hmgd,
        "editseq": run_analysis,
    })

16
design/pyproject.toml Normal file
View File

@@ -0,0 +1,16 @@
[project]
name = "pgrna"
version = "0.1.0"
description = "Saturation-mutagenesis and pegRNA design utilities"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"biopython>=1.85",
"fire>=0.7.1",
"loguru>=0.7.3",
"openpyxl>=3.1.5",
"pandas>=2.3.3",
"pyfaidx>=0.9.0.3",
"pyliftover>=0.4.1",
"rich>=14.2.0",
]

233
design/src/editseq.py Normal file
View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
created by lanzl
modified by zym
"""
import os
import pandas as pd
import re
import sys
from pyfaidx import Fasta, FetchError
# Cluster-local reference FASTA paths (hg19 / hg38).
# NOTE(review): currently unused — run_analysis() takes ``reference`` as a
# parameter, and the code paths that used these constants are commented out.
HG19_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/gencode/GRCh37.p13.genome.fa.gz"
HG38_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
def parse_region(region_str: str) -> tuple:
    """Parse a "chrom:start-end" genomic region string.

    Returns:
        (chrom, start, end) with ``chrom`` guaranteed to carry a "chr"
        prefix and the coordinates converted to int.

    Raises:
        ValueError: if ``region_str`` is not in "chrom:start-end" form.
            (Previously a malformed string crashed with an opaque
            AttributeError because the None match was not checked.)
    """
    match = re.match(r"(\w+):(\d+)-(\d+)", region_str)
    if match is None:
        raise ValueError(f"invalid genomic region: {region_str!r}")
    chrom, start, end = match.groups()
    if not chrom.lower().startswith("chr"):
        chrom = "chr" + chrom
    return chrom, int(start), int(end)
def extract_orf_sequence(genome: Fasta, chrom: str, start: int, end: int) -> str:
    """Fetch the upper-cased sequence of ``chrom:start-end`` (1-based,
    inclusive) from a preloaded pyfaidx ``Fasta``.

    When the prefixed name is absent from the FASTA index, the chromosome
    name without its "chr" prefix is tried before giving up.
    """
    try:
        return str(genome.get_seq(chrom, start, end)).upper()
    except (KeyError, FetchError) as e:
        if not chrom.lower().startswith("chr"):
            # No alternative spelling to try — propagate the original error.
            raise
        alt_chrom = chrom[3:]
        try:
            return str(genome.get_seq(alt_chrom, start, end)).upper()
        except (KeyError, FetchError) as inner_e:
            raise FetchError(
                f"Requested rname '{chrom}' (also tried '{alt_chrom}') does not exist in FASTA index."
            ) from inner_e
def generate_editseq(
    original: str,
    replacement: str,
    region_str: str,
    genome: Fasta,
    flank_size: int = 100,
) -> str:
    """Build an EditSeq string: ``flank_size`` bp of genomic context on each
    side of the mutation, with the edit itself rendered as
    "(ORIG/REPL)" for substitutions, "(-ORIG)" for deletions and
    "(+REPL)" for insertions."""
    chrom, mut_start, mut_end = parse_region(region_str)
    # Flanks sit immediately outside the mutated interval.
    upstream = extract_orf_sequence(genome, chrom, mut_start - flank_size, mut_start - 1)
    downstream = extract_orf_sequence(genome, chrom, mut_end + 1, mut_end + flank_size)
    original = str(original).strip()
    replacement = str(replacement).strip()
    if original and replacement:
        middle = f"({original}/{replacement})"
    elif original:
        middle = f"(-{original})"
    elif replacement:
        middle = f"(+{replacement})"
    else:
        middle = "(Invalid mutation logic)"
    return f"{upstream}{middle}{downstream}"
# --- Amino-acid (saturation mutagenesis) records ---
def process_aa_mutations(df_aa: pd.DataFrame, genome_hg38: Fasta) -> pd.DataFrame:
    """Turn AA saturation-mutagenesis rows into EditSeq records.

    Returns:
        DataFrame with sequence_name, editseq, strategy and mutation_type
        columns, one row per input record.
    """
    records = []
    # to_dict('records') avoids the per-row overhead of iterrows().
    for entry in df_aa.to_dict("records"):
        src = str(entry["origial_code"]).strip() if pd.notna(entry["origial_code"]) else ""
        dst = str(entry["mutation_code"]).strip() if pd.notna(entry["mutation_code"]) else ""
        # Classify the edit for the record.
        if src and dst:
            mut_type = "REPL"
        elif src:
            mut_type = "DEL"
        elif dst:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"
        records.append({
            "sequence_name": f"{entry['gene']}_AA{entry['aa_index']}_{entry['origial_code']}_{entry['mutation_code']}",
            "editseq": generate_editseq(
                original=entry["origial_code"],
                replacement=entry["mutation_code"],
                region_str=entry["aa_region"],
                genome=genome_hg38,
            ),
            "strategy": entry["strategy"],
            "mutation_type": mut_type,
        })
    return pd.DataFrame(records)
# --- SNP/cDNA mutation records ---
def process_snp_mutations(
    df_snp: pd.DataFrame, genome_hg19: Fasta, genome_hg38: Fasta
) -> pd.DataFrame:
    """Turn SNP/cDNA mutation rows into EditSeq records.

    Each row selects hg19 or hg38 flanks via its ``version`` column.

    Returns:
        DataFrame with sequence_name, editseq and mutation_type columns.
    """
    results = []
    # to_dict('records') avoids the per-row overhead of iterrows().
    for row in df_snp.to_dict("records"):
        original = (
            str(row["original_codon"]).strip()
            if pd.notna(row["original_codon"])
            else ""
        )
        replacement = (
            str(row["mutation_code"]).strip() if pd.notna(row["mutation_code"]) else ""
        )
        # Pick the reference genome matching this row's coordinate version.
        version = str(row["version"]).lower()
        genome_to_use = genome_hg38 if version == "hg38" else genome_hg19
        # Classify the edit for the record name.
        if original and replacement:
            mut_type = "REPL"
        elif original and not replacement:
            mut_type = "DEL"
        elif not original and replacement:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"
        # e.g. "FANCA_REPL_c123A>G": dots removed, "_" replaced with "p".
        cdna_code_clean = str(row["cdna code"]).replace(".", "").replace("_", "p")
        seq_name = f"{row['gene']}_{mut_type}_{cdna_code_clean}"
        editseq = generate_editseq(
            original=original,
            replacement=replacement,
            region_str=str(row["mutation_region"]),
            genome=genome_to_use,
        )
        results.append(
            {"sequence_name": seq_name, "editseq": editseq, "mutation_type": mut_type}
        )
    return pd.DataFrame(results)
def run_analysis(infile, reference, outdir):
    """Group the AA mutation table by strategy and write one EditSeq CSV
    ("aa_<STRATEGY>_editseq_output.csv") per strategy into ``outdir``.

    Args:
        infile: CSV of designed AA mutations (design_by_aa output).
        reference: genome FASTA path for flank extraction.
        outdir: output directory, created if missing.
    """
    genome = Fasta(reference)
    os.makedirs(outdir, exist_ok=True)
    aa_df_input = pd.read_csv(infile)
    # Normalise strategy labels ("3n" -> "3N") before grouping.
    aa_df_input["strategy"] = aa_df_input["strategy"].str.upper()
    for strategy in aa_df_input["strategy"].unique():
        if pd.isna(strategy):
            continue
        subset = aa_df_input[aa_df_input["strategy"] == strategy].copy()
        if subset.empty:
            continue
        processed = process_aa_mutations(subset, genome)
        processed.to_csv(os.path.join(outdir, f"aa_{strategy}_editseq_output.csv"), index=False)
if __name__ == "__main__":
    import sys

    # Defect fixed: run_analysis() was called with no arguments even though it
    # requires three, so running this script always raised a TypeError.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} <infile.csv> <reference.fa> <outdir>")
    run_analysis(sys.argv[1], sys.argv[2], sys.argv[3])

53
design/src/liftover.py Normal file
View File

@@ -0,0 +1,53 @@
import pandas as pd
from pyliftover import LiftOver
# lo = LiftOver('/home/zym/projects/pgRNA/liftover/hg19ToHg38.over.chain.gz')
# NOTE(review): duplicate import — LiftOver is already imported above.
from pyliftover import LiftOver
# Module-level hg19 -> hg38 converter, built at import time.
# NOTE(review): constructing LiftOver("hg19", "hg38") fetches/loads the chain
# file when this module is imported — confirm that is acceptable for callers.
lo = LiftOver("hg19", "hg38")
def convert_interval(chrom, start, end):
    """Lift the interval (start, end) from hg19 to hg38 via the module-level
    converter.

    pyliftover works with 1-based coordinates here; for 0-based BED input,
    pass start+1.  Returns (chrom, start, end) on success, or None when
    either endpoint cannot be mapped.  When several mappings exist, the
    highest-confidence one is taken for each endpoint, and the resulting
    coordinates are swapped if needed so start <= end.
    """
    mapped_start = lo.convert_coordinate(chrom, start)
    mapped_end = lo.convert_coordinate(chrom, end)
    if not mapped_start or not mapped_end:
        return None
    best_start = max(mapped_start, key=lambda hit: hit[3])
    best_end = max(mapped_end, key=lambda hit: hit[3])
    # NOTE: the chromosome is taken from the start-point mapping — presumably
    # both endpoints land on the same chromosome; verify for edge cases.
    lifted_chrom, lifted_start = best_start[0], best_start[1]
    lifted_end = best_end[1]
    if lifted_start >= lifted_end:
        lifted_start, lifted_end = lifted_end, lifted_start
    return lifted_chrom, lifted_start, lifted_end
def get_seq(path, coord):
    """Fetch the sequence for ``coord`` = (chrom, start, end) from an indexed
    FASTA file.

    pysam uses 0-based, half-open coordinates (BED-compatible); the .fai
    index next to the FASTA is read automatically.

    Defect fixed: the FastaFile handle was never closed (resource leak) —
    it is now managed with a ``with`` block.
    """
    import pysam
    with pysam.FastaFile(path) as fasta:
        return fasta.fetch(region=f"{coord[0]}:{coord[1]}-{coord[2]}")
if __name__ == "__main__":
    # Library module — nothing to run directly; import convert_interval/get_seq.
    pass

216
design/src/mutation.py Normal file
View File

@@ -0,0 +1,216 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import itertools
from Bio.Seq import Seq
from loguru import logger
# Standard genetic code: DNA codon -> one-letter amino acid ('*' = stop).
# Codons are enumerated in the classic T/C/A/G textbook order, so the
# amino-acid string below reads row by row off a standard codon table.
_CODON_ORDER = "TCAG"
_AA_BY_CODON = (
    "FFLLSSSSYY**CC*W"  # TTT..TGG
    "LLLLPPPPHHQQRRRR"  # CTT..CGG
    "IIIMTTTTNNKKSSRR"  # ATT..AGG
    "VVVVAAAADDEEGGGG"  # GTT..GGG
)
codon_table = {
    "".join(codon): aa
    for codon, aa in zip(itertools.product(_CODON_ORDER, repeat=3), _AA_BY_CODON)
}
# Reverse lookup: amino acid -> list of codons (insertion order follows codon_table).
aa_to_codons = {}
for _codon, _aa in codon_table.items():
    aa_to_codons.setdefault(_aa, []).append(_codon)
# Nucleotide alphabet used by the variant generators below.
bases = ["A", "T", "G", "C"]
def generate_nnn():
    """Return every possible 3-mer over the DNA alphabet (all 64 codons)."""
    return list(map("".join, itertools.product(bases, repeat=3)))
def generate_2n_variants(original_codon, fixed_position=None):
    """Enumerate codons in which one position keeps the original base while
    the other two run over all four bases ("double-N" mutagenesis).

    Args:
        original_codon: 3-letter codon supplying the fixed base.
        fixed_position: 0, 1 or 2 to pin that position; None enumerates all
            three pinning modes (NNX, NXN, XNN).

    Returns:
        Sorted, de-duplicated list of variant codons.
    """
    keep_positions = [0, 1, 2] if fixed_position is None else [fixed_position]
    variants = set()
    for keep in keep_positions:
        free = [i for i in range(3) if i != keep]
        for pair in itertools.product(bases, repeat=2):
            codon = ["_", "_", "_"]
            codon[keep] = original_codon[keep]
            for slot, base in zip(free, pair):
                codon[slot] = base
            variants.add("".join(codon))
    return sorted(variants)
def generate_1n_variants(original_codon, fixed_positions=None):
    """Enumerate codons in which two positions keep the original bases and
    the remaining one runs over all four bases ("single-N" mutagenesis).

    Args:
        original_codon: 3-letter codon supplying the fixed bases.
        fixed_positions: pair of positions to pin, e.g. [0, 1]; None
            enumerates all three pinning modes.

    Returns:
        Sorted, de-duplicated list of variant codons (includes the original).
    """
    variants = set()
    positions = [fixed_positions] if fixed_positions else [[0, 1], [0, 2], [1, 2]]
    for fix in positions:
        # The single position not in ``fix`` is the variable one.  (The
        # original computed this twice: a redundant ``3 - sum(fix)`` was
        # immediately overwritten by an equivalent search loop — removed.)
        var_pos = next(i for i in range(3) if i not in fix)
        for base in bases:
            codon = ["_", "_", "_"]
            codon[fix[0]] = original_codon[fix[0]]
            codon[fix[1]] = original_codon[fix[1]]
            codon[var_pos] = base
            variants.add("".join(codon))
    return sorted(variants)
def translate(codon):
    """Translate one codon to its one-letter amino acid; unknown codons give "X"."""
    try:
        return codon_table[codon]
    except KeyError:
        return "X"
def design_mutations_for_orf(dna_seq, strategy="3N"):
    """
    Design saturation mutations over a whole ORF.

    dna_seq: coding sequence; its length must be a multiple of 3.
    strategy: '3N' (fully random codon), '2N' (two random bases) or
        '1N' (one random base).

    Returns a list with one dict per codon: original codon/amino acid, the
    generated variants and summary statistics (amino-acid distribution and
    introduced stop codons).

    Raises ValueError for a bad length or an unknown strategy.
    """
    if len(dna_seq) % 3 != 0:
        raise ValueError(f"ORF 长度必须是 3 的倍数!{dna_seq}")
    results = []
    for index in range(len(dna_seq) // 3):
        orig_codon = dna_seq[index * 3 : index * 3 + 3]
        orig_aa = translate(orig_codon)
        logger.debug(
            f"\n--- 位点 {index + 1} (氨基酸 {index + 1}): {orig_aa} ({orig_codon}) ---"
        )
        if strategy == "3N":
            variants = generate_nnn()
            logger.debug(f"策略: 3N (NNN) → 共 {len(variants)} 种组合")
        elif strategy == "2N":
            variants = generate_2n_variants(orig_codon)
            logger.debug(f"策略: 2N (任意两个随机) → 共 {len(variants)} 种组合")
        elif strategy == "1N":
            variants = generate_1n_variants(orig_codon)
            logger.debug(f"策略: 1N (任意一个随机) → 共 {len(variants)} 种组合")
        else:
            raise ValueError("strategy 必须是 '3N', '2N', 或 '1N'")
        # Drop malformed codons (defensive; the generators only emit 3-mers).
        valid_variants = [v for v in variants if len(v) == 3]
        # Tally the amino-acid outcome of every variant.
        mutant_aa_count = {}
        stop_count = 0
        for variant in valid_variants:
            aa = translate(variant)
            if aa == "*":
                stop_count += 1
            mutant_aa_count[aa] = mutant_aa_count.get(aa, 0) + 1
        logger.debug(f"→ 共产生 {len(valid_variants)} 个有效突变")
        logger.debug(f"→ 可产生 {len(mutant_aa_count)} 种不同氨基酸(含终止)")
        logger.debug(f"→ 引入终止密码子: {stop_count}")
        logger.debug(f"→ 氨基酸分布: {mutant_aa_count}")
        results.append(
            {
                "position": index + 1,
                "original_codon": orig_codon,
                "original_aa": orig_aa,
                "variants": valid_variants,
                "variant_count": len(valid_variants),
                "mutant_aa_count": mutant_aa_count,
                "stop_count": stop_count,
            }
        )
    return results
if __name__ == "__main__":
    # Library module; no standalone behavior yet.
    pass

236
design/src/reader.py Normal file
View File

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
从gtf中读取CDS并读取对应的sequence
"""
import gzip
import pandas as pd
from Bio.Seq import Seq
from loguru import logger
from pyfaidx import Fasta
class Region(object):
    """A genomic interval (chrom, start, end, strand) used to merge CDS records.

    `sequence` is filled later via set_seq(); `kind_` holds the raw category
    label and the `kind` property decorates it for sub-codon fragments.
    `addition` is a free slot for callers.
    """

    def __init__(self, chrom, start, end, strand="+", kind=None):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.strand = strand
        self.sequence = None
        self.kind_ = kind
        self.addition = None

    @classmethod
    def create(cls, region):
        """Build a plus-strand Region from a "chrom:start-end" string."""
        chrom, sites = region.split(":")
        sites = [int(x) for x in sites.split("-")]
        return cls(chrom, sites[0], sites[-1], "+")

    def set_seq(self, sequence: str):
        """Store the upper-cased sequence; reverse-complement it on the minus strand.

        BUGFIX: the reverse complement is now taken from the upper-cased copy.
        Previously the raw input was used, so lower-case minus-strand input
        produced a lower-case sequence and broke downstream codon lookups.
        """
        self.sequence = sequence.upper()
        if self.strand == "-":
            self.sequence = str(Seq(self.sequence).reverse_complement())

    def __and__(self, other):
        """True when both regions share a chromosome and overlap."""
        if self.chrom != other.chrom:
            return False
        return self.start < other.end and self.end > other.start

    def __add__(self, other):
        """Merge an overlapping region into self (in place) and return self."""
        if not self & other:
            raise ValueError("没有重合位点")
        self.start = min(self.start, other.start)
        self.end = max(self.end, other.end)
        return self

    def __str__(self) -> str:
        return f"{self.chrom}:{self.start}-{self.end}"

    def __hash__(self):
        return hash(str(self))

    def __len__(self):
        return self.end - self.start

    @property
    def kind(self):
        """Category label; fragments shorter than one codon get a "cross_" prefix."""
        if len(self) >= 3:
            return self.kind_
        if not self.kind_:
            return ""
        else:
            return f"cross_{self.kind_}"
def read_gtf(gtf_path):
    """
    Load a GTF file and return a DataFrame of its CDS records, with the
    free-form attribute column expanded into one column per attribute key.
    """
    logger.info("正在读取 GTF 文件...")
    gtf_columns = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attribute",
    ]
    full_df = pd.read_csv(
        gtf_path, sep="\t", comment="#", header=None, names=gtf_columns, low_memory=False
    )
    # Keep only CDS features.
    cds_df = full_df[full_df["feature"] == "CDS"].copy()
    # Expand the attribute strings (key "value"; ...) into separate columns.
    try:
        attributes_df = pd.json_normalize(cds_df["attribute"].apply(parse_attributes))
    except Exception as e:
        logger.error(f"解析 attribute 字段失败: {e}")
        raise
    result_df = pd.concat([cds_df.reset_index(drop=True), attributes_df], axis=1)
    logger.info(f"成功读取并解析 GTF 文件,共 {len(result_df)} 个 CDS 特征。")
    return result_df
def parse_attributes(attr_str):
    """Parse a GTF attribute string ('key "value"; ...') into a dict."""
    parsed = {}
    for token in attr_str.split(";"):
        token = token.strip()
        # Skip empty fragments and tokens without a key/value separator.
        if token and " " in token:
            key, _, raw_value = token.partition(" ")
            parsed[key] = raw_value.strip('"')
    return parsed
def get_cds_for_gene(cds_df, gene_name):
    """
    Collect the CDS rows whose transcript_id equals *gene_name* and merge
    overlapping intervals (in file order) into a list of Region objects.

    NOTE(review): despite the parameter name, matching is done on
    transcript_id, not gene_name — see the commented-out gene filter below;
    confirm this is intended.
    """
    logger.info(f"正在查找基因 '{gene_name}' 的 CDS...")
    # Attach parsed attribute dicts so we can filter on transcript_id.
    cds_df["attributes_parsed"] = cds_df["attribute"].apply(parse_attributes)
    # Keep only the rows belonging to the requested transcript.
    gene_cds_list = []
    for idx, row in cds_df.iterrows():
        attrs = row["attributes_parsed"]
        if attrs.get("transcript_id") == gene_name:
            # if attrs.get('gene_name') == gene_name or attrs.get('gene_id').startswith(gene_name):
            gene_cds_list.append(row)
    if not gene_cds_list:
        raise ValueError(f"未在 GTF 中找到基因 '{gene_name}'")
    df = pd.DataFrame(gene_cds_list)
    df = df[
        ["seqname", "feature", "start", "end", "strand", "transcript_id"]
    ].drop_duplicates()
    # Merge overlapping CDS intervals; Region.__add__ mutates and returns
    # `last`, so `res` ends up holding disjoint regions only.
    res = []
    last = None
    for _, row in df.iterrows():
        temp = Region(
            str(row["seqname"]),
            row["start"],
            row["end"],
            str(row["strand"]),
            row["transcript_id"],
        )
        if last is None:
            last = temp
        elif temp & last:
            last = last + temp
        else:
            res.append(last)
            last = temp
    # Membership is identity-based here (Region defines __hash__ but no
    # __eq__), so this appends the final open region exactly once.
    if last not in res:
        res.append(last)
    return res
def load_uniprot_region(path):
    """Read "chrom:start-end" records (first whitespace-separated field per
    line, '#' lines skipped) and merge overlapping ones into Region objects."""
    merged = []
    current = None
    with open(path) as handle:
        for raw_line in handle:
            if raw_line.startswith("#"):
                continue
            candidate = Region.create(raw_line.split()[0])
            if current is None:
                current = candidate
            elif candidate & current:
                current = current + candidate
            else:
                merged.append(current)
                current = candidate
    # Flush the final open region (identity-based membership check).
    if current not in merged:
        merged.append(current)
    return merged
def extract_orf_sequence(genome_fasta, cds_rows, half_open=False):
    """
    Fetch the genomic sequence for each CDS Region and attach it via
    set_seq() (which also reverse-complements minus-strand regions).

    genome_fasta: path to an indexed FASTA file (pyfaidx).
    cds_rows: list of Region objects; mutated in place and returned.
    half_open: when True, treat region ends as half-open (end base excluded).

    Raises ValueError when cds_rows is empty.
    """
    if not cds_rows:
        raise ValueError("not cds")
    seqname = cds_rows[0].chrom
    strand = cds_rows[0].strand
    logger.debug(f"从参考基因组提取序列 (chr{seqname})...")
    genome = Fasta(genome_fasta)
    # Resolve the chromosome record; retry with/without the "chr" prefix
    # because FASTA naming conventions differ between references.
    try:
        chrom_seq = genome[seqname]
    except KeyError:
        if "chr" in seqname:
            seqname = seqname.replace("chr", "")
        else:
            seqname = "chr" + seqname
        chrom_seq = genome[seqname]
    for row in cds_rows:
        start = int(row.start) - 1  # GTF is 1-based; pyfaidx slicing is 0-based
        end = int(row.end) - (1 if half_open else 0)
        # Sub-codon fragments flagged as "cross_*" are widened to a full codon,
        # anchored at whichever side the fragment crosses.
        if len(row) < 3 and "cross" in row.kind:
            if row.kind == "cross_start":
                start = end - 3
            else:
                end = start + 3
        row.set_seq(chrom_seq[start:end].seq)
    return cds_rows
if __name__ == "__main__":
    # Library module; no standalone behavior yet.
    pass

139
design/src/safe_target.py Normal file
View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
生成safe targeting 序列
"""
import pandas as pd
import random
import pysam
from tqdm import tqdm
# Fixed RNG seed for reproducible sampling.
seed = 42
# NOTE(review): seeded with the literal 42 rather than `seed` — identical
# today, but the two can drift apart; confirm and prefer random.seed(seed).
random.seed(42)
# RNA codon table keyed by Chinese amino-acid names (values use U, not T);
# codons() below converts them to DNA codons. Keys are runtime data used for
# lookups — do not rename or translate them.
__AAs__ = {
    "丙氨酸": ["GCU", "GCC", "GCA", "GCG"],
    "精氨酸": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "天冬酰胺": ["AAU", "AAC"],
    "天冬氨酸": ["GAU", "GAC"],
    "半胱氨酸": ["UGU", "UGC"],
    "谷氨酰胺": ["CAA", "CAG"],
    "谷氨酸": ["GAA", "GAG"],
    "甘氨酸": ["GGU", "GGC", "GGA", "GGG"],
    "组氨酸": ["CAU", "CAC"],
    "异亮氨酸": ["AUU", "AUC", "AUA"],
    "亮氨酸": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"],
    "赖氨酸": ["AAA", "AAG"],
    "甲硫氨酸": ["AUG"],
    "苯丙氨酸": ["UUU", "UUC"],
    "脯氨酸": ["CCU", "CCC", "CCA", "CCG"],
    "丝氨酸": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],
    "苏氨酸": ["ACU", "ACC", "ACA", "ACG"],
    "色氨酸": ["UGG"],
    "酪氨酸": ["UAU", "UAC"],
    "缬氨酸": ["GUU", "GUC", "GUA", "GUG"],
    "终止密码子": ["UAA", "UAG", "UGA"],
}
def codons():
    """Yield (amino-acid name, DNA codon list) pairs, converting RNA U to DNA T."""
    for amino_acid, rna_codons in __AAs__.items():
        yield amino_acid, [codon.replace("U", "T") for codon in rna_codons]
class Region:
    """Represents a 3bp codon region in the full sequence."""

    def __init__(self, chrom: str, start: int, end: int):
        self.chrom = chrom
        self.start = start
        self.end = end
        # Offset of the first position whose 3-mer is a known codon (see shift()).
        self.__shift__ = 0

    def __str__(self):
        return f"{self.chrom}:{self.start}-{self.end}"

    def shift(self, fasta: str):
        """Find the smallest offset whose 3-mer is a known codon and store it.

        BUGFIX: the original loop's early-exit guard only fired when the shift
        was non-zero, so a match at offset 0 did not stop the scan and a later
        matching offset silently overwrote it. The loop now stops at the first
        matching offset.
        """
        for offset in range(0, self.end - self.start):
            seq = extract_fastq_seq(
                fasta,
                Region(self.chrom, self.start + offset, self.start + offset + 3),
            )
            if any(seq in dna_codons for _, dna_codons in codons()):
                self.__shift__ = offset
                break

    def choose(self, number: int = 3):
        """Split the (frame-shifted) region into 3bp frames and sample from them.

        Returns all frames when fewer than *number* exist; otherwise a random
        sample drawn with random.sample (module-level RNG).
        """
        length_of_codon = 3
        frames = []
        for pos in range(self.start + self.__shift__, self.end, length_of_codon):
            if pos + length_of_codon > self.end:
                break
            frames.append([pos, pos + length_of_codon])
        if number > len(frames):
            return [Region(self.chrom, f[0], f[1]) for f in frames]
        return [Region(self.chrom, f[0], f[1]) for f in random.sample(frames, number)]
def extract_fastq_seq(fastq: str, region: Region, seq_len: int = 100):
    """Fetch the reference bases covered by *region* from an indexed FASTA file.

    seq_len is accepted for interface compatibility but not used here.
    """
    with pysam.FastaFile(fastq) as handle:
        return handle.fetch(region.chrom, region.start, region.end)
def mutation(seq: str):
    """Return a DNA codon encoding a *different* amino acid than *seq*.

    The module RNG is re-seeded on every call, so the substitution is
    deterministic per input codon. Returns None (implicitly) when seq matches
    no table entry.

    NOTE(review): __AAs__ stores RNA codons (with U) while seq comes from the
    reference (DNA, with T), so only U-free codons can ever match — confirm
    whether the lookup should go through codons() instead.
    """
    random.seed(seed)
    for amino_acid, rna_codons in __AAs__.items():
        if seq not in rna_codons:
            continue
        other = random.sample([x for x in __AAs__.keys() if x != amino_acid], 1)[0]
        return random.sample(__AAs__[other], 1)[0].replace("U", "T")
def main(infile, outfile, reference = "../ref/UCSC/hg19.fa.gz", seq_len: int = 100):
    """Design safe-targeting edits: sample codons from safe regions and emit
    (sequence_name, editseq) rows in flank(original/mutant)flank format.

    infile: Excel workbook with a "Human Safe Regions" sheet; column 1 holds
        "chrom;start;end" identifiers.
    outfile: CSV destination.
    reference: indexed reference FASTA (pysam.FastaFile compatible).
    seq_len: flank length written on each side of the edited codon.
    """
    meta = pd.read_excel(infile, sheet_name="Human Safe Regions", header=None)
    # Down-sample to 2000 candidate regions for a reproducible design set.
    meta = meta.sample(n=2000, random_state=seed)
    data = []
    for idx in tqdm(meta.iloc[:, 0], total=meta.shape[0]):
        idx = idx.split(";")
        region = Region(idx[0], int(idx[1]), int(idx[2]))
        # Align the reading frame to the first valid codon before sampling.
        region.shift(reference)
        regions = region.choose(5)
        for reg in regions:
            seq = extract_fastq_seq(reference, reg)
            mut = mutation(seq)
            # mutation() returns None for codons it cannot match — skip those.
            if seq is None or mut is None:
                continue
            key = str(reg) + "_" + seq + "_" + mut
            before = extract_fastq_seq(reference, Region(region.chrom, reg.start - seq_len, reg.start))
            after = extract_fastq_seq(reference, Region(region.chrom, reg.end, reg.end + seq_len))
            # editseq format expected downstream: flank(original/mutant)flank
            seq = f"{before}({seq}/{mut}){after}"
            data.append({"sequence_name": key, "editseq": seq})
    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
if __name__ == "__main__":
    # CLI entry point via python-fire: arguments map onto main()'s parameters.
    from fire import Fire
    Fire(main)

114
design/src/snp.py Normal file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""用来解析snp错配信息"""
import re
from itertools import product
def generate_sequences_with_combinations(seq):
"""
将 DNA 序列中连续的 N 替换为所有可能的 A/T/C/G 组合,
返回所有可能的序列列表。
参数:
seq (str): 输入的 DNA 序列,可包含 N
返回:
list: 所有可能的序列(字符串列表)
"""
if "N" not in seq:
return [seq]
# 分割序列,保留分隔符信息
segments = []
i = 0
while i < len(seq):
if seq[i] == "N":
j = i
while j < len(seq) and seq[j] == "N":
j += 1
length = j - i
segments.append(("N", length)) # ('N', 3) 表示连续3个N
i = j
else:
j = i
while j < len(seq) and seq[j] != "N":
j += 1
segments.append(("seq", seq[i:j]))
i = j
# 提取每个 N 块的可能组合
n_block_options = []
for seg_type, content in segments:
if seg_type == "N":
# 生成所有长度为 content 的 ATCG 组合
options = ["".join(p) for p in product("ATCG", repeat=content)]
n_block_options.append(options)
# 如果没有 N直接返回原序列
if not n_block_options:
return [seq]
# 使用 itertools.product 生成所有组合
from itertools import product as iter_product
all_combinations = list(iter_product(*n_block_options))
# 构建所有可能的序列
results = []
for combo in all_combinations:
new_seq = ""
n_index = 0
for seg_type, content in segments:
if seg_type == "seq":
new_seq += content
elif seg_type == "N":
new_seq += combo[n_index]
n_index += 1
results.append(new_seq)
return results
def decode_snp(label, ref_start=0):
if label is None:
return ""
if ":" in label:
label = label.split(":")[-1]
if ref_start <= 0 and not label.startswith("g."):
raise ValueError(f"{label} not genomic label")
elif ref_start > 0 and not label.startswith("c."):
raise ValueError(f"{label} not cdna label")
label = re.sub(r"([cg]\.|\[\d+\])", "", label)
sites = []
for x in label.split("_"):
if not x:
continue
x = re.sub(r"[^\d\+-]", "", x)
if "+" in x:
x = [int(y) for y in x.split("+")]
x = x[0] + x[-1]
elif "-" in x:
x = [int(y) for y in x.split("-")]
x = x[0] + x[-1]
else:
x = int(x)
sites.append(x + ref_start)
sites = sorted(sites)
rule = re.sub(r"[\d_\+-]", "", label)
return sites, rule.strip()
if __name__ == "__main__":
pass

263
design/src/snv-N-2N-3N.py Normal file
View File

@@ -0,0 +1,263 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import sys
import itertools
import random
import gzip
from typing import List, Dict, Any, Optional
import pandas as pd
from loguru import logger
# Pipeline overview:
# - split the target sequence into 3bp codons and, for every codon,
#   systematically generate three mutation classes (1N/2N/3N);
# - attach 100bp of up/downstream flanking sequence to every mutation and
#   randomly sample 150 mutations per class.
# Mutation design constants
NUCLEOTIDES = ["A", "T", "C", "G"]
UPSTREAM_LEN = 100  # flank length before the codon (bp)
DOWNSTREAM_LEN = 100  # flank length after the codon (bp)
TARGET_MUTATIONS = 150  # sample size per strategy
class Region:
    """Represents a 3bp codon region in the full sequence.

    NOTE(review): callers pass end = start + 2 for a 3bp codon (inclusive
    convention); only `sequence` and `absolute_index` are read downstream.
    """

    def __init__(
        self, chrom: str, start: int, end: int, sequence: str, absolute_index: int
    ):
        self.chrom = chrom
        self.start, self.end = start, end
        self.absolute_index = absolute_index
        self.sequence = sequence.upper()
def read_fasta(fasta_path: str) -> Dict[str, str]:
    """Parses FASTA file, returning {header: sequence}.

    Headers are truncated at the first whitespace; sequences are upper-cased
    and RNA U is converted to DNA T. Transparently handles .gz input.
    Returns {} (after logging an error) when the file does not exist.
    """
    if not os.path.exists(fasta_path):
        logger.error(f"FASTA file not found: {fasta_path}")
        return {}
    sequences: Dict[str, str] = {}
    header: Optional[str] = None
    chunks: List[str] = []
    opener = gzip.open if fasta_path.endswith(".gz") else open
    with opener(fasta_path, "rt") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(">"):
                # Flush the previous record before starting a new one.
                if header and chunks:
                    sequences[header] = "".join(chunks).upper().replace("U", "T")
                header = line[1:].split()[0]
                chunks = []
            elif header:
                chunks.append(line)
    if header and chunks:
        sequences[header] = "".join(chunks).upper().replace("U", "T")
    return sequences
def split_sequence_to_codons(full_seq: str, gene_name: str) -> List[Region]:
    """Splits full sequence into 3bp Region objects (trailing partial codon dropped).

    NOTE(review): end is start + 2 (inclusive convention); downstream code
    relies only on absolute_index and sequence.
    """
    regions: List[Region] = []
    complete_codons = len(full_seq) // 3
    for codon_idx in range(complete_codons):
        offset = codon_idx * 3
        triplet = full_seq[offset : offset + 3]
        regions.append(Region(gene_name, offset, offset + 2, triplet, offset))
    return regions
def analyze_variant(ref: str, alt: str) -> str:
    """Compress a substitution into (ref/alt) edit notation.

    Examples:
        'CAT' -> 'CGT' gives 'C(A/G)T'
        'CAT' -> 'CGC' gives 'C(AT/GC)'   (consecutive differences grouped)
        'TAT' -> 'GAG' gives '(T/G)A(T/G)' (non-adjacent differences split)
        length mismatch gives '(ref/alt)' verbatim; identical input is returned as-is.

    BUGFIX(doc): the previous docstring illustrated the intermittent rule with
    'GTT' -> 'GCG', but those differences are adjacent (positions 1 and 2) and
    are therefore grouped as 'G(TT/CG)'.
    """
    if len(ref) != len(alt):
        return f"({ref}/{alt})"
    diffs = [
        {"index": i, "ref_base": ref[i], "alt_base": alt[i]}
        for i in range(len(ref))
        if ref[i] != alt[i]
    ]
    if not diffs:
        return ref
    positions = [d["index"] for d in diffs]
    is_consecutive = all(
        positions[i + 1] - positions[i] == 1 for i in range(len(positions) - 1)
    )
    # Rule 1: one contiguous run of differences -> a single (ref/alt) group.
    if is_consecutive:
        start_pos, end_pos = positions[0], positions[-1] + 1
        return f"{ref[:start_pos]}({ref[start_pos:end_pos]}/{alt[start_pos:end_pos]}){ref[end_pos:]}"
    # Rule 2: scattered differences -> one (r/a) group per differing base.
    out = []
    prev_end = 0
    for d in diffs:
        pos, r, a = d["index"], d["ref_base"], d["alt_base"]
        out.append(ref[prev_end:pos])
        out.append(f"({r}/{a})")
        prev_end = pos + 1
    out.append(ref[prev_end:])
    return "".join(out)
def generate_codon_mutations(
    original_codon: str, n_mutations: int, alphabet: Optional[List[str]] = None
) -> List[str]:
    """Generates all codon variants carrying exactly n_mutations substitutions.

    original_codon: the reference codon (any length, not only 3bp).
    n_mutations: number of positions that must differ from the reference.
    alphabet: bases to substitute with; defaults to the module-level
        NUCLEOTIDES (new, backward-compatible generalization).

    Returns the sorted list of unique variant strings (never includes the
    original codon).
    """
    if alphabet is None:
        alphabet = NUCLEOTIDES
    mutants = set()
    codon_length = len(original_codon)
    for indices in itertools.combinations(range(codon_length), n_mutations):
        base_options: List[List[str]] = []
        for i in range(codon_length):
            if i in indices:
                # Mutated sites may take any base except the original one.
                base_options.append([b for b in alphabet if b != original_codon[i]])
            else:
                base_options.append([original_codon[i]])
        for combination in itertools.product(*base_options):
            mutant_codon = "".join(combination)
            if mutant_codon != original_codon:
                mutants.add(mutant_codon)
    return sorted(mutants)
def generate_editseq_and_metadata(
    full_seq: str, regions: List[Region], gene_name: str
) -> pd.DataFrame:
    """Generates all mutations (1N, 2N, 3N) and constructs the final DataFrame.

    Each output row carries a descriptive sequence_name
    (GENE_SUB_STRATEGY_AA<index>_<ref>><alt>) and an editseq consisting of the
    up/downstream flanks around the compact (ref/alt) codon notation.
    """
    records: List[Dict[str, str]] = []
    for codon_no, region in enumerate(regions):
        pos = region.absolute_index
        ref_codon = region.sequence
        # 1. Flanking sequences, clipped at the sequence boundaries.
        upstream = full_seq[max(0, pos - UPSTREAM_LEN) : pos]
        downstream = full_seq[pos + 3 : min(len(full_seq), pos + 3 + DOWNSTREAM_LEN)]
        for strategy, n_mut in (("3N", 3), ("2N", 2), ("1N", 1)):
            for alt_codon in generate_codon_mutations(ref_codon, n_mut):
                # Compact (ref/alt) notation for the edited codon.
                notation = analyze_variant(ref_codon, alt_codon)
                records.append(
                    {
                        "sequence_name": f"{gene_name}_SUB_{strategy}_AA{codon_no + 1}_{ref_codon}>{alt_codon}",
                        "editseq": f"{upstream}{notation}{downstream}",
                        "strategy": strategy,
                        "mutation_type": "REPL",  # Replacement
                    }
                )
    return pd.DataFrame(records)
def run_mutation_design(fasta_file: str, gene_name: str, output_base_name: str):
    """Executes the mutation design pipeline and saves 3 separate files.

    fasta_file: (optionally gzipped) FASTA with candidate transcripts.
    gene_name: substring matched case-insensitively against record IDs;
        when no ID matches, the longest sequence is used as a fallback.
    output_base_name: file-name template containing the literal "{strategy}",
        replaced by 1N/2N/3N for each of the three output CSVs.
    """
    logger.info(f"Targeting gene: {gene_name}")
    fasta_data = read_fasta(fasta_file)
    full_seq, target_id = "", ""
    # Locate target sequence
    for seq_id, seq in fasta_data.items():
        if gene_name.upper() in seq_id.upper():
            full_seq = seq
            target_id = seq_id
            break
    if not full_seq and fasta_data:
        # Fallback: use longest sequence
        target_id, full_seq = max(fasta_data.items(), key=lambda item: len(item[1]))
        if full_seq:
            logger.warning(
                f"Using longest sequence ID: {target_id} (Length: {len(full_seq)} bp)"
            )
    if not full_seq:
        logger.error(f"Failed to extract target sequence.")
        return
    logger.info(f"Target sequence ID: {target_id}, Length: {len(full_seq)} bp")
    # 1. Generate ALL mutations (1N, 2N, 3N)
    cds_regions = split_sequence_to_codons(full_seq, gene_name)
    all_mutations_df = generate_editseq_and_metadata(full_seq, cds_regions, gene_name)
    # 2. Process and save one file per strategy
    strategies = ["1N", "2N", "3N"]
    for strategy in strategies:
        # Filter for the current strategy
        strategy_df = all_mutations_df[all_mutations_df["strategy"] == strategy].copy()
        original_count = len(strategy_df)
        # Determine output file name (e.g. AAVS1_1N_150_mutations.csv)
        output_file_name = output_base_name.replace("{strategy}", strategy)
        if original_count == 0:
            logger.warning(
                f"Strategy {strategy}: No mutations generated. Skipping file creation for {output_file_name}."
            )
            continue
        # Random sampling for the current strategy (fixed seed => reproducible)
        if original_count > TARGET_MUTATIONS:
            final_df = strategy_df.sample(n=TARGET_MUTATIONS, random_state=42)
            logger.success(
                f"Strategy {strategy}: Sampled {TARGET_MUTATIONS} mutations from {original_count} designs."
            )
        else:
            final_df = strategy_df
            logger.warning(
                f"Strategy {strategy}: Generated {original_count} mutations; saving all."
            )
        # Save result, ensuring column order
        final_df[["sequence_name", "editseq", "strategy", "mutation_type"]].to_csv(
            output_file_name, index=False
        )
        logger.success(f"Strategy {strategy}: Design saved to {output_file_name}.")
if __name__ == "__main__":
    # Example run against the AAVS1 reference transcriptome.
    AAVS1_FASTA_PATH = (
        "/rawdata1/project/peRNA_design/ref/AAVS1/ncbi_dataset/data/rna.fna"
    )
    GENE_NAME = "AAVS1"
    # "{strategy}" is replaced by 1N/2N/3N when each file is written.
    OUTPUT_BASE_NAME = "AAVS1_{strategy}_150_mutations.csv"
    run_mutation_design(
        fasta_file=AAVS1_FASTA_PATH,
        gene_name=GENE_NAME,
        output_base_name=OUTPUT_BASE_NAME,
    )

87
filter.py Normal file
View File

@@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
from glob import glob
import pandas as pd
def amino_acid_to_codon(amino_acid):
    """
    Map a one-letter amino-acid code (case-insensitive) to its DNA codons.

    Returns the list of codons for the residue ('*' = stop codons), or an
    empty list for an unrecognised letter.
    """
    genetic_code = {
        "A": ["GCT", "GCC", "GCA", "GCG"],
        "R": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
        "N": ["AAT", "AAC"],
        "D": ["GAT", "GAC"],
        "C": ["TGT", "TGC"],
        "E": ["GAA", "GAG"],
        "Q": ["CAA", "CAG"],
        "G": ["GGT", "GGC", "GGA", "GGG"],
        "H": ["CAT", "CAC"],
        "I": ["ATT", "ATC", "ATA"],
        "L": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
        "K": ["AAA", "AAG"],
        "M": ["ATG"],
        "F": ["TTT", "TTC"],
        "P": ["CCT", "CCC", "CCA", "CCG"],
        "S": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
        "T": ["ACT", "ACC", "ACA", "ACG"],
        "W": ["TGG"],
        "Y": ["TAT", "TAC"],
        "V": ["GTT", "GTC", "GTA", "GTG"],
        "*": ["TAA", "TAG", "TGA"],
    }
    return genetic_code.get(amino_acid.upper(), [])
def main(ref, infile, outfile):
    """Filter a gzipped design CSV, keeping rows whose target codon/label pair
    appears in the reference Excel sheet.

    ref: Excel workbook; sheet index 1 holds rows whose column 1 is a one-letter
        amino acid and column 2 a label — every (codon, label) pair is allowed.
    infile: gzipped CSV with a "sequence_name" (or "Target_name") column.
    outfile: gzipped CSV receiving the header plus the matching rows.
    """
    print(infile, outfile)
    df = pd.read_excel(ref, 1)
    # Build the allow-list of "CODON_label" keys (dict used as a set).
    keys = {}
    for _, row in df.iterrows():
        row = list(row)
        for src in amino_acid_to_codon(row[1]):
            keys[f"{src}_{row[2]}"] = 0
    if os.path.dirname(outfile):
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
    header = False
    target = None
    with gzip.open(outfile, "wt+") as w:
        with gzip.open(infile, "rt") as r:
            for line in r:
                if not header:
                    w.write(line.strip() + "\n")
                    header = line.strip().split(",")
                    # PERF: resolve the name column once here instead of
                    # re-running .index() for every data line (was loop-invariant).
                    try:
                        target = header.index("sequence_name")
                    except ValueError:
                        target = header.index("Target_name")
                    continue
                key = line.strip().split(",")[target]
                # sequence_name format: GENE_SUB_STRATEGY_..._SRC>DST — keep
                # everything from the third field onward as the lookup key.
                key = "_".join(key.split("_")[2:]).strip('"')
                if key in keys:
                    w.write(line.strip() + "\n")
if __name__ == '__main__':
    # CLI entry point via python-fire: arguments map onto main()'s parameters.
    from fire import Fire
    Fire(main)

210
filter_freq.py Normal file
View File

@@ -0,0 +1,210 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
import heapq
import click
import csv
import polars as pl
from multiprocessing import Pool
from glob import glob
from tqdm import tqdm
def amino_acid_to_codon():
    """Return the set of all 64 DNA codons (61 coding + 3 stop).

    BUGFIX(doc): the old docstring described a per-amino-acid lookup taking a
    single-letter parameter; this function actually takes no arguments and
    returns the full codon set used to vet target names (see __CODONS__).
    """
    genetic_code = {
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'N': ['AAT', 'AAC'],
        'D': ['GAT', 'GAC'],
        'C': ['TGT', 'TGC'],
        'E': ['GAA', 'GAG'],
        'Q': ['CAA', 'CAG'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        'H': ['CAT', 'CAC'],
        'I': ['ATT', 'ATC', 'ATA'],
        'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'K': ['AAA', 'AAG'],
        'M': ['ATG'],
        'F': ['TTT', 'TTC'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'W': ['TGG'],
        'Y': ['TAT', 'TAC'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
        '*': ['TAA', 'TAG', 'TGA'],
    }
    # Flatten all codon lists into one set.
    return {codon for codon_list in genetic_code.values() for codon in codon_list}
# All valid DNA codons; used to vet the trailing codon field of sequence names.
__CODONS__ = amino_acid_to_codon()
def reader(path: str, rt_len: int = 24):
    """
    Stream a gzipped CSV, yielding rows whose RTlength is <= rt_len.

    Memory use is constant (one row buffered), suitable for GB-scale files.
    Rows with a missing or unparsable RTlength are skipped instead of aborting
    the stream. BUGFIX: only TypeError was caught before, so a missing column
    raised KeyError and a non-numeric value raised ValueError, crashing the
    whole stream this generator is documented to survive.
    """
    with gzip.open(path, "rt", newline="") as f:
        for row in csv.DictReader(f):
            try:
                if float(row["RTlength"]) <= rt_len:
                    yield row
            except (KeyError, TypeError, ValueError):
                continue
def __check_target__(key: str):
    """True when the last '_'-separated field of *key* is a valid DNA codon."""
    # '>' separates ref>alt codons in some name formats; normalize it to '_'.
    fields = key.replace(">", "_").split("_")
    return fields[-1] in __CODONS__
def __decode_codon_n__(key: str) -> str:
    """Degenerate-encode a name like BRIP1_AA580_CTC_CTT: bases that differ
    between the final two fields become N (-> BRIP1_AA580_CTC_CTN)."""
    fields = key.replace(">", "_").split("_")
    try:
        ref_codon, alt_codon = fields[-2], fields[-1]
        fields[-1] = "".join(
            r if r == a else "N" for r, a in zip(ref_codon, alt_codon)
        )
    except IndexError as err:
        # Name did not contain two trailing codon fields; surface it loudly.
        print(fields)
        raise err
    return "_".join(fields)
def __call_func__(args):
    """Worker: keep the top_n best-scoring records per sequence key of one file.

    args: (input_path, outdir, top_n, degenerate) — packed into one tuple
        because multiprocessing.Pool.imap passes a single argument.

    Writes a gzipped CSV (same basename, in outdir) containing, for each key,
    its top_n records ordered score-descending.
    """
    f, outdir, top_n, degenerate = args
    data = {}
    # BUGFIX: heap entries used to be (score, dict) tuples; on equal scores
    # heapq compared the dicts and raised TypeError (which the old code caught
    # only to re-raise). A monotonic sequence number now breaks ties so the
    # record dicts are never compared.
    seq_no = 0
    # Stream the (pre-filtered) rows of the input file.
    for rec in tqdm(reader(f)):
        # Keys follow the designed sequence_name convention.
        key = rec["sequence_name"]
        if not __check_target__(key):
            # Skip targets whose trailing field is not a known codon.
            continue
        if degenerate:
            try:
                key = __decode_codon_n__(rec["sequence_name"])
                rec["orig_seq_name"] = rec.pop("sequence_name")
                rec["sequence_name"] = key
            except IndexError:
                continue
        if key not in data:
            data[key] = []
        # Pick whichever score column this file provides.
        if "DeepCas9score" in rec.keys():
            k = "DeepCas9score"
        elif "PRIDICT2_0_editing_Score_deep_K562" in rec.keys():
            k = "PRIDICT2_0_editing_Score_deep_K562"
        else:
            print(f, rec)
            continue
        try:
            score = float(rec[k])
        except (ValueError, KeyError):
            print(f"Warning: Skipping invalid record in {f}: {rec}")
            continue
        seq_no += 1
        entry = (score, seq_no, rec)
        # Min-heap of size top_n keeps the best top_n scores per key.
        if len(data[key]) < top_n:
            heapq.heappush(data[key], entry)
        elif score > data[key][0][0]:
            heapq.heapreplace(data[key], entry)
    # Second pass: flatten, ordering each key's records by score descending.
    final_records = []
    for heap in data.values():
        final_records.extend(
            rec for _, _, rec in sorted(heap, key=lambda x: x[0], reverse=True)
        )
    if not final_records:
        print(f"No valid records in {f}, skipping output.")
        return
    # Safe CSV output via the csv module.
    output_path = os.path.join(outdir, os.path.basename(f))
    with gzip.open(output_path, "wt+", newline="", encoding="utf-8") as w:
        writer = csv.DictWriter(
            w, fieldnames=final_records[0].keys(), quoting=csv.QUOTE_MINIMAL
        )
        writer.writeheader()
        writer.writerows(final_records)
@click.command()
@click.option("-i", "--indir", type=str, help="字符串形式的输入路径,可以*通配多个文件和目录")
@click.option("-o", "--outdir", type=str, help="输出目录")
@click.option("-t", "--top-n", type=int, help="选择前几", default=3)
@click.option("-n", "--degenerate", is_flag=True, help="是否使用兼并碱基")
@click.argument('args', nargs=-1)  # catch-all positional arguments
def main(indir, outdir, top_n, degenerate, args):
    """Fan __call_func__ out over every file matched by the *indir* glob.

    Positional arguments may stand in for -i/-o (first = indir, last = outdir).
    NOTE(review): an empty glob produces Pool(0), which raises ValueError —
    consider guarding; behavior left unchanged here.
    """
    if not indir and len(args) > 0:
        indir = args[0]
    if not outdir and len(args) > 0:
        outdir = args[-1]
    if indir == outdir:
        raise ValueError("indir and outdir should not be the same")
    os.makedirs(outdir, exist_ok=True)
    # Build one worker-argument list per matched input file.
    args = [[f, outdir, top_n, degenerate] for f in glob(indir)]
    # for arg in args:
    #     print(arg[0])
    #     __call_func__(arg)
    with Pool(len(args)) as p:
        list(tqdm(p.imap(__call_func__, args), total=len(args)))
if __name__ == '__main__':
    # Click handles argument parsing for main().
    main()

0
interactive/README.md Normal file
View File

222
interactive/db.py Normal file
View File

@@ -0,0 +1,222 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import peewee as pw
import re
import csv
import gzip
from typing import Dict
# Module-wide SQLite connection; all models below bind to it via BaseModel.
db = pw.SqliteDatabase("./pegrna.db")
class BaseModel(pw.Model):
    """Base peewee model binding all tables to the module-level SQLite db."""
    class Meta:
        database = db
# Upstream CSV column name -> model field name, per supported input format.
# Keys of the outer dict are the lower-cased `kind` values accepted by insert().
KEY_MAP = {
    "pridict2": {
        "sequence_name": "sequence",
        "EditedAllele": "dst",
        "OriginalAllele": "src",
        "PRIDICT2_0_editing_Score_deep_K562": "k562",
        "PRIDICT2_0_editing_Score_deep_HEK": "hek",
        "K562_rank": "k562_rank",
        "HEK_rank": "hek_rank",
        "PRIDICT2_Format": "template",
        "Target-Strand": "strand",
        "PBSlength": "pbs_len",
        "RToverhanglength": "rtt_oh_len",
        "RTlength": "rtt_len",
        "Spacer-Sequence": "spacer",
        "Scaffold_Optimized": "scaffold",
        "pegRNA": "pegrna",
        "PBSrevcomp": "pbs",
        "RTseqoverhangrevcomp": "rtt_oh",
        "RTrevcomp": "rtt",
    },
    "prime_design": {
        "Target_name": "sequence",
        # src/dst are not provided by PrimeDesign; format_data derives them
        # from the sequence name instead.
        # "": "dst",
        # "": "src",
        "Target_sequence": "template",
        "Strand": "strand",
        "PBS_length": "pbs_len",
        "RTT_length": "rtt_len",
        "Spacer_sequence": "spacer",
        "PAM_sequence": "pam",
        "Extension_sequence": "extension",  # RTT + PBS
        "Spacer_sequence_order_TOP": "before_spacer",
        "Spacer_sequence_order_BOTTOM": "after_spacer",
        "pegRNA_extension_sequence_order_TOP": "before_pegnra_ext",
        "pegRNA_extension_sequence_order_BOTTOM": "after_pegnra_ext",
    }
}
def bulk_insert(table, data, chunk = 100):
    """Insert *data* (a list of field dicts) into *table* in chunks of
    *chunk* rows, all inside a single transaction."""
    with db.atomic():
        for i in range(0, len(data), chunk):
            table.insert_many(data[i:i + chunk]).execute()
class Pridict2(BaseModel):
    """One scored pegRNA design row imported from PRIDICT2 output CSVs.

    Field names follow the KEY_MAP["pridict2"] mapping; gene/aa/src/dst are
    derived by format_data() from the sequence name when absent.
    """
    gene = pw.CharField()  # gene symbol parsed from the sequence name
    aa = pw.IntegerField()  # amino-acid index parsed from the sequence name
    sequence = pw.CharField()  # full sequence_name key
    src = pw.CharField()  # original codon
    dst = pw.CharField()  # edited codon
    k562 = pw.FloatField()  # PRIDICT2 editing score (K562)
    hek = pw.FloatField()  # PRIDICT2 editing score (HEK)
    k562_rank = pw.IntegerField()
    hek_rank = pw.IntegerField()
    template = pw.CharField()
    strand = pw.CharField()
    pbs_len = pw.IntegerField()
    rtt_oh_len = pw.IntegerField()
    rtt_len = pw.IntegerField()
    spacer = pw.CharField()
    scaffold = pw.CharField()
    pegrna = pw.CharField()
    pbs = pw.CharField()
    rtt_oh = pw.CharField()
    rtt = pw.CharField()
    class Meta:
        table_name = "pridict2"
class PrimeDesign(BaseModel):
    """One pegRNA design row imported from PrimeDesign output CSVs.

    Field names follow the KEY_MAP["prime_design"] mapping; gene/aa/src/dst
    and pbs/rtt are derived by format_data() when absent.
    """
    gene = pw.CharField()
    aa = pw.IntegerField()
    sequence = pw.CharField()
    src = pw.CharField()
    dst = pw.CharField()
    template = pw.CharField()
    strand = pw.CharField()
    pbs_len = pw.IntegerField()
    rtt_len = pw.IntegerField()
    pam = pw.CharField()
    spacer = pw.CharField()
    extension = pw.CharField()  # RTT + PBS; split into pbs/rtt by format_data
    pbs = pw.CharField()
    rtt = pw.CharField()
    before_spacer = pw.CharField()
    after_spacer = pw.CharField()
    # NOTE(review): "pegnra" is a typo but deliberately kept — it matches the
    # KEY_MAP values above, which are the runtime dict keys for this field.
    before_pegnra_ext = pw.CharField()
    after_pegnra_ext= pw.CharField()
    class Meta:
        table_name = "prime_design"
def format_data(value: Dict, mapping: Dict[str, str]) -> Dict[str, object]:
    """Rename raw CSV columns via *mapping* and derive missing fields.

    value: one CSV row (column name -> raw cell value).
    mapping: source column name -> model field name (see KEY_MAP).

    Derives gene/aa/src/dst from the sequence name (GENE_AA<idx>_SRC_DST) when
    absent, and splits `extension` into pbs/rtt using the length columns.

    BUGFIX: the mapping loop used to name its loop variable `value`, shadowing
    the parameter; it only worked because .items() was evaluated before the
    first rebind. The loop variables are now distinct.
    """
    res = {}
    for src_key, cell in value.items():
        if src_key in mapping:
            res[mapping[src_key]] = cell
    if not res.get("src"):
        res["src"] = res["sequence"].split("_")[-2]
        res["dst"] = res["sequence"].split("_")[-1]
    # e.g. "AA580" -> 580; "BRIP1_AA580_CTC_CTT" -> gene "BRIP1"
    res["aa"] = int(re.sub(r"\D", "", res["sequence"].split("_")[1]))
    res["gene"] = res["sequence"].split("_")[0]
    if not res.get("pbs") and res.get("extension") and res.get("pbs_len") and res.get("rtt_len"):
        # Only split when the lengths account for the whole extension.
        if len(res["extension"]) == int(res["pbs_len"]) + int(res["rtt_len"]):
            res["pbs"] = res["extension"][:int(res["pbs_len"])]
            res["rtt"] = res["extension"][int(res["pbs_len"]):]
    return res
def insert(path: str, kind: str = "PRIDICT2", chunk: int = 10000):
    """Stream a gzipped CSV into the pridict2 / prime_design table.

    path: gzipped CSV file.
    kind: "PRIDICT2" or "PRIME_DESIGN" (case-insensitive; must be a KEY_MAP key).
    chunk: number of rows buffered between bulk inserts.
    """
    if not Pridict2.table_exists():
        Pridict2.create_table()
    if not PrimeDesign.table_exists():
        PrimeDesign.create_table()
    kind = kind.lower()
    assert kind in KEY_MAP.keys()
    data = []
    rows = 0
    with gzip.open(path, 'rt', encoding='utf-8') as file:
        csv_dict_reader = csv.DictReader(file)
        # Read row by row; each row is a dict keyed by column name.
        for row in csv_dict_reader:
            # Map raw columns onto model fields before buffering.
            data.append(format_data(row, KEY_MAP[kind]))
            rows += 1
            if len(data) >= chunk:
                print(f"finished {rows} rows")
                bulk_insert(Pridict2 if kind == "pridict2" else PrimeDesign, data)
                data = []
    if data:
        # Flush the final partial chunk.
        bulk_insert(Pridict2 if kind == "pridict2" else PrimeDesign, data)
def index():
    """Create single-column indexes on the common query columns of both tables."""
    # Plain (non-unique) indexes, one per listed column; IF NOT EXISTS makes
    # this safe to re-run.
    for i in [
        Pridict2.gene,
        Pridict2.aa,
        Pridict2.sequence,
        Pridict2.dst,
        Pridict2.src,
        Pridict2.k562,
        Pridict2.hek,
        Pridict2.pbs_len,
        Pridict2.rtt_len,
    ]:
        print(Pridict2.__name__, i.name)
        sql = f"CREATE INDEX IF NOT EXISTS {Pridict2.__name__}_{i.name}_idx ON pridict2 ({i.name});"
        db.execute_sql(sql)
    for i in [
        PrimeDesign.gene,
        PrimeDesign.aa,
        PrimeDesign.sequence,
        PrimeDesign.dst,
        PrimeDesign.src,
        PrimeDesign.pbs_len,
        PrimeDesign.rtt_len,
    ]:
        print(PrimeDesign.__name__, i.name)
        sql = f"CREATE INDEX IF NOT EXISTS {PrimeDesign.__name__}_{i.name}_idx ON prime_design ({i.name});"
        db.execute_sql(sql)
def table_columns(table):
    """Return the class attributes of *table* whose names contain no '__'."""
    visible = {}
    for attr_name, attr_value in table.__dict__.items():
        if "__" in attr_name:
            continue
        visible[attr_name] = attr_value
    return visible
if __name__ == "__main__":
    # Smoke check: show the visible (non-dunder) attributes of the model.
    print(table_columns(Pridict2))
    pass

View File

@@ -0,0 +1,5 @@
# Vue 3 + TypeScript + Vite
This template should help get you started developing with Vue 3 and TypeScript in Vite. The template uses Vue 3 `<script setup>` SFCs, check out the [script setup docs](https://v3.vuejs.org/api/sfc-script-setup.html#sfc-script-setup) to learn more.
Learn more about the recommended Project Setup and IDE Support in the [Vue Docs TypeScript Guide](https://vuejs.org/guide/typescript/overview.html#project-setup).

77
interactive/frontend/auto-imports.d.ts vendored Normal file
View File

@@ -0,0 +1,77 @@
/* eslint-disable */
/* prettier-ignore */
// @ts-nocheck
// noinspection JSUnusedGlobalSymbols
// Generated by unplugin-auto-import
// NOTE(review): generated file — do not edit by hand; re-run the build to regenerate.
// biome-ignore lint: disable
export {}
declare global {
const EffectScope: typeof import('vue').EffectScope
const computed: typeof import('vue').computed
const createApp: typeof import('vue').createApp
const customRef: typeof import('vue').customRef
const defineAsyncComponent: typeof import('vue').defineAsyncComponent
const defineComponent: typeof import('vue').defineComponent
const effectScope: typeof import('vue').effectScope
const getCurrentInstance: typeof import('vue').getCurrentInstance
const getCurrentScope: typeof import('vue').getCurrentScope
const getCurrentWatcher: typeof import('vue').getCurrentWatcher
const h: typeof import('vue').h
const inject: typeof import('vue').inject
const isProxy: typeof import('vue').isProxy
const isReactive: typeof import('vue').isReactive
const isReadonly: typeof import('vue').isReadonly
const isRef: typeof import('vue').isRef
const isShallow: typeof import('vue').isShallow
const markRaw: typeof import('vue').markRaw
const nextTick: typeof import('vue').nextTick
const onActivated: typeof import('vue').onActivated
const onBeforeMount: typeof import('vue').onBeforeMount
const onBeforeUnmount: typeof import('vue').onBeforeUnmount
const onBeforeUpdate: typeof import('vue').onBeforeUpdate
const onDeactivated: typeof import('vue').onDeactivated
const onErrorCaptured: typeof import('vue').onErrorCaptured
const onMounted: typeof import('vue').onMounted
const onRenderTracked: typeof import('vue').onRenderTracked
const onRenderTriggered: typeof import('vue').onRenderTriggered
const onScopeDispose: typeof import('vue').onScopeDispose
const onServerPrefetch: typeof import('vue').onServerPrefetch
const onUnmounted: typeof import('vue').onUnmounted
const onUpdated: typeof import('vue').onUpdated
const onWatcherCleanup: typeof import('vue').onWatcherCleanup
const provide: typeof import('vue').provide
const reactive: typeof import('vue').reactive
const readonly: typeof import('vue').readonly
const ref: typeof import('vue').ref
const resolveComponent: typeof import('vue').resolveComponent
const shallowReactive: typeof import('vue').shallowReactive
const shallowReadonly: typeof import('vue').shallowReadonly
const shallowRef: typeof import('vue').shallowRef
const toRaw: typeof import('vue').toRaw
const toRef: typeof import('vue').toRef
const toRefs: typeof import('vue').toRefs
const toValue: typeof import('vue').toValue
const triggerRef: typeof import('vue').triggerRef
const unref: typeof import('vue').unref
const useAttrs: typeof import('vue').useAttrs
const useCssModule: typeof import('vue').useCssModule
const useCssVars: typeof import('vue').useCssVars
const useDialog: typeof import('naive-ui').useDialog
const useId: typeof import('vue').useId
const useLoadingBar: typeof import('naive-ui').useLoadingBar
const useMessage: typeof import('naive-ui').useMessage
const useModel: typeof import('vue').useModel
const useNotification: typeof import('naive-ui').useNotification
const useSlots: typeof import('vue').useSlots
const useTemplateRef: typeof import('vue').useTemplateRef
const watch: typeof import('vue').watch
const watchEffect: typeof import('vue').watchEffect
const watchPostEffect: typeof import('vue').watchPostEffect
const watchSyncEffect: typeof import('vue').watchSyncEffect
}
// for type re-export
declare global {
// @ts-ignore
export type { Component, Slot, Slots, ComponentPublicInstance, ComputedRef, DirectiveBinding, ExtractDefaultPropTypes, ExtractPropTypes, ExtractPublicPropTypes, InjectionKey, PropType, Ref, ShallowRef, MaybeRef, MaybeRefOrGetter, VNode, WritableComputedRef } from 'vue'
import('vue')
}
30
interactive/frontend/components.d.ts vendored Normal file
View File

@@ -0,0 +1,30 @@
/* eslint-disable */
// @ts-nocheck
// biome-ignore lint: disable
// oxlint-disable
// ------
// Generated by unplugin-vue-components
// Read more: https://github.com/vuejs/core/pull/3399
export {}
/* prettier-ignore */
declare module 'vue' {
export interface GlobalComponents {
HelloWorld: typeof import('./src/components/HelloWorld.vue')['default']
NConfigProvider: typeof import('naive-ui')['NConfigProvider']
NDataTable: typeof import('naive-ui')['NDataTable']
NFlex: typeof import('naive-ui')['NFlex']
NForm: typeof import('naive-ui')['NForm']
NFormItem: typeof import('naive-ui')['NFormItem']
NGi: typeof import('naive-ui')['NGi']
NGrid: typeof import('naive-ui')['NGrid']
NInputNumber: typeof import('naive-ui')['NInputNumber']
NLayout: typeof import('naive-ui')['NLayout']
NLayoutContent: typeof import('naive-ui')['NLayoutContent']
NLayoutHeader: typeof import('naive-ui')['NLayoutHeader']
NMessageProvider: typeof import('naive-ui')['NMessageProvider']
NPagination: typeof import('naive-ui')['NPagination']
NSelect: typeof import('naive-ui')['NSelect']
}
}

View File

@@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>frontend</title>
</head>
<body>
<div id="app"></div>
<script type="module" src="/src/main.ts"></script>
</body>
</html>

View File

@@ -0,0 +1,26 @@
{
"name": "frontend",
"private": true,
"version": "0.0.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "vue-tsc -b && vite build",
"preview": "vite preview"
},
"dependencies": {
"axios": "^1.13.5",
"naive-ui": "^2.43.2",
"unplugin-auto-import": "^21.0.0",
"unplugin-vue-components": "^31.0.0",
"vue": "^3.5.25"
},
"devDependencies": {
"@types/node": "^24.10.1",
"@vitejs/plugin-vue": "^6.0.2",
"@vue/tsconfig": "^0.8.1",
"typescript": "~5.9.3",
"vite": "^7.3.1",
"vue-tsc": "^3.1.5"
}
}

1554
interactive/frontend/pnpm-lock.yaml generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="31.88" height="32" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 257"><defs><linearGradient id="IconifyId1813088fe1fbc01fb466" x1="-.828%" x2="57.636%" y1="7.652%" y2="78.411%"><stop offset="0%" stop-color="#41D1FF"></stop><stop offset="100%" stop-color="#BD34FE"></stop></linearGradient><linearGradient id="IconifyId1813088fe1fbc01fb467" x1="43.376%" x2="50.316%" y1="2.242%" y2="89.03%"><stop offset="0%" stop-color="#FFEA83"></stop><stop offset="8.333%" stop-color="#FFDD35"></stop><stop offset="100%" stop-color="#FFA800"></stop></linearGradient></defs><path fill="url(#IconifyId1813088fe1fbc01fb466)" d="M255.153 37.938L134.897 252.976c-2.483 4.44-8.862 4.466-11.382.048L.875 37.958c-2.746-4.814 1.371-10.646 6.827-9.67l120.385 21.517a6.537 6.537 0 0 0 2.322-.004l117.867-21.483c5.438-.991 9.574 4.796 6.877 9.62Z"></path><path fill="url(#IconifyId1813088fe1fbc01fb467)" d="M185.432.063L96.44 17.501a3.268 3.268 0 0 0-2.634 3.014l-5.474 92.456a3.268 3.268 0 0 0 3.997 3.378l24.777-5.718c2.318-.535 4.413 1.507 3.936 3.838l-7.361 36.047c-.495 2.426 1.782 4.5 4.151 3.78l15.304-4.649c2.372-.72 4.652 1.36 4.15 3.788l-11.698 56.621c-.732 3.542 3.979 5.473 5.943 2.437l1.313-2.028l72.516-144.72c1.215-2.423-.88-5.186-3.54-4.672l-25.505 4.922c-2.396.462-4.435-1.77-3.759-4.114l16.646-57.705c.677-2.35-1.37-4.583-3.769-4.113Z"></path></svg>

After

Width:  |  Height:  |  Size: 1.5 KiB

View File

@@ -0,0 +1,26 @@
<script setup lang="ts">
import HelloWorld from './components/HelloWorld.vue'
</script>
<template>
<n-config-provider>
<n-message-provider>
<HelloWorld />
</n-message-provider>
</n-config-provider>
</template>
<style scoped>
.logo {
height: 6em;
padding: 1.5em;
will-change: filter;
transition: filter 300ms;
}
.logo:hover {
filter: drop-shadow(0 0 2em #646cffaa);
}
.logo.vue:hover {
filter: drop-shadow(0 0 2em #42b883aa);
}
</style>

View File

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="37.07" height="36" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 198"><path fill="#41B883" d="M204.8 0H256L128 220.8L0 0h97.92L128 51.2L157.44 0h47.36Z"></path><path fill="#41B883" d="m0 0l128 220.8L256 0h-51.2L128 132.48L50.56 0H0Z"></path><path fill="#35495E" d="M50.56 0L128 133.12L204.8 0h-47.36L128 51.2L97.92 0H50.56Z"></path></svg>

After

Width:  |  Height:  |  Size: 496 B

View File

@@ -0,0 +1,375 @@
<script setup lang="ts">
import {onMounted, ref, h, watch} from 'vue'
import axios, { AxiosError, type AxiosResponse } from "axios";
import type {SelectOption, DataTableSortState} from "naive-ui"
import { useMessage, NEllipsis, NButton, NPopover } from "naive-ui"
const BASE_URL = "/api" // http://10.126.126.11:5555/api
const APIs = {
unique: `${BASE_URL}/gene`,
content: `${BASE_URL}/records`,
};
const message = useMessage();
const genes = ref<SelectOption[]|null>(null);
const tables = [
{"value": "pridict2", "label": "Pridict2"},
{"value": "prime_design", "label": "Prime Design"},
]
interface FormData {
gene: string | null
source: string
pbs_len: number | null
rtt_len: number | null
}
const formData = ref<FormData>({
source: "pridict2",
gene: null,
pbs_len: 0,
rtt_len: 0,
})
interface Pagination {
order: string;
order_by: string;
total: number;
page: number;
length: number;
}
const pagination = ref<Pagination>({
order: "desc",
order_by: "gene",
total: 10,
length: 10,
page: 1,
});
function processSorter(
  options: DataTableSortState | DataTableSortState[] | null,
) {
  // Map the data table's sorter state onto the query parameters.
  if (options !== null) {
    // naive-ui may report multiple sorters as an array; the original cast
    // (`options as DataTableSortState`) crashed on `columnKey` in that case.
    // This table only configures single-column sorting, so take the first.
    const state = Array.isArray(options) ? options[0] : options;
    if (state) {
      pagination.value.order_by = state.columnKey.toString();
      // `order` is boolean `false` when sorting is cleared; fall back to asc.
      pagination.value.order =
        typeof state.order === "boolean" ? "asc" : state.order;
      return;
    }
  }
  // No active sorter: restore the default ordering.
  pagination.value.order_by = "id";
  pagination.value.order = "descend";
}
interface RowData {
id: number;
gene: string
aa: number
sequence: string
src: string
dst: string
k562: number|null
hek: number|null
k562_rank: number|null
hek_rank: number|null
template: string
strand: string
pbs_len: number
rtt_len: number
spacer: string|null
scaffold: string|null
pegrna: string|null
pbs: string
rtt: string
extension: string|null
before_spacer: string|null
after_spacer: string|null
before_pegnra_ext: string|null
after_pegnra_ext: string|null
}
let columns = [
{
title: "Gene", key: "gene", defaultSortOrder: "descend", width: 60, resizable: true,
},
{
title: "AA (n)", key: "aa", defaultSortOrder: "ascend", width: 50, resizable: true, sorter: true,
},
{
title: "name", key: "sequence", defaultSortOrder: "ascend", width: 100, resizable: true, sorter: true,
},
{
title: "原序列", key: "src", defaultSortOrder: "ascend", width: 60, resizable: true, sorter: true,
},
{
title: "编辑后", key: "dst", defaultSortOrder: "ascend", width: 60, resizable: true, sorter: true,
},
]
let post_columns = [
{title: "strand", key: "strand", defaultSortOrder: "ascend", maxWidth: 20, resizable: true,},
{title: "PBS len", key: "pbs_len", defaultSortOrder: "ascend", maxWidth: 20, resizable: true, sorter: true,},
{title: "RTT len", key: "rtt_len", defaultSortOrder: "ascend", maxWidth: 20, resizable: true, sorter: true,},
{title: "PBS", key: "pbs", defaultSortOrder: "ascend", width: 120, resizable: true,},
{title: "RTT", key: "rtt", defaultSortOrder: "ascend", width: 120, resizable: true,}
]
/**
 * Copy `text` to the clipboard using a temporary off-screen <textarea>.
 * Relies on the legacy execCommand path so it also works on non-secure
 * origins. Resolves on success, rejects with the thrown error otherwise.
 */
const copyText = (text: string) => {
  const helper = document.createElement("textarea");
  helper.value = text;
  helper.style.position = "fixed";
  helper.style.top = "0";
  helper.style.left = "0";
  document.body.appendChild(helper);
  helper.focus();
  helper.select();
  try {
    document.execCommand("copy");
    return Promise.resolve();
  } catch (err) {
    return Promise.reject(err);
  } finally {
    // Always detach the helper element, success or failure.
    document.body.removeChild(helper);
  }
};
const createColumns = () => {
  // Assemble the column list for the current data source: base identity
  // columns, then (pridict2 only) score columns, then PBS/RTT details,
  // then long-sequence columns rendered as copy-on-click buttons.
  let real_columns = [...columns]
  let rest_columns = ["spacer", "extension", "before_spacer", "after_spacer", "before_pegnra_ext", "after_pegnra_ext"]
  if (formData.value.source === "pridict2") {
    rest_columns = ["template", "spacer", "scaffold", "pegrna"]
    for (let i of ["k562", "hek"]) {
      real_columns.push({
        title: i, key: i, width: 60, resizable: true, sorter: true,
        render: (row: RowData) => {
          // Scores are typed number|null; render a placeholder instead of
          // crashing on null.toFixed().
          if (row[i] == null) return "-"
          return h(
            NPopover,
            { trigger: "hover" },
            {
              trigger: () => `${parseFloat(row[i].toFixed(2))} (${row[i + "_rank"]})`,
              // BUGFIX: the arrow body must *return* the string; the
              // original braced body without `return` yielded undefined,
              // so the popover rendered empty.
              default: () => `Score: ${row[i]}; Rank=${row[i + "_rank"]}`,
            },
          )
        },
      })
    }
  }
  real_columns = real_columns.concat(post_columns)
  for (let i of rest_columns) {
    real_columns.push({
      title: i,
      key: i,
      width: 240,
      resizable: true,
      // Long sequences: show an ellipsised button that copies the full
      // value to the clipboard when clicked.
      render: (row: RowData) =>
        h(
          NButton,
          {
            size: "small", type: "primary", dashed: true,
            onClick: () => { copyText(row[i]) },
          },
          {
            default: () =>
              h(
                NEllipsis,
                {
                  style: "max-width: 200px",
                  tooltip: {
                    style: {
                      maxWidth: '300px',
                      whiteSpace: 'pre-wrap', // allow line wrapping in tooltip
                      wordBreak: 'break-word', // break inside long tokens
                    },
                  },
                },
                { default: () => row[i] },
              ),
          },
        ),
    })
  }
  return real_columns
};
// True while a /records request is in flight; drives the table spinner.
const loading = ref(false)
// Current page of rows shown in the data table.
const data = ref<RowData[]>([]);
// Fetch one page of records matching the current form filters and
// pagination/sort state, then reconcile local pagination with the
// server's authoritative reply.
const getRecords = () => {
  loading.value = true;
  let params = {
    gene: formData.value.gene,
    pbs_len: formData.value.pbs_len,
    rtt_len: formData.value.rtt_len,
    source: formData.value.source,
    // NOTE(review): the backend parameter is named "offset" but carries
    // the 1-based page number — confirm against /api/records.
    offset: pagination.value.page,
    length: pagination.value.length,
    order_by: pagination.value.order_by,
    order: pagination.value.order,
  }
  axios
    .get(APIs.content, { params: params })
    .then((response: AxiosResponse) => {
      let resp = response.data;
      data.value = resp.data;
      // Adopt the server's totals / page size only when they changed,
      // then clamp the current page so it never points past the end.
      if (resp.total !== pagination.value.total) {
        pagination.value.total = resp.total;
      }
      if (resp.length !== pagination.value.length) {
        pagination.value.length = resp.length;
      }
      if (pagination.value.page > Math.ceil(resp.total / resp.length)) {
        pagination.value.page = Math.ceil(resp.total / resp.length);
      }
    })
    .catch((error: Error | AxiosError) => {
      message.error(error.message);
    }).finally(() => {
      loading.value = false;
    });
}
// On first mount: load the full gene list, preselect the first gene, then
// fetch the first page of records (even if the gene request failed).
onMounted(() => {
  axios.get(APIs.unique).then((response: AxiosResponse) => {
    let res = []
    for (let i of response.data) {
      res.push({"value": i, "label": i})
    }
    genes.value = res
    // NOTE(review): assumes the backend returns a non-empty gene list;
    // res[0] is undefined otherwise — confirm or guard upstream.
    formData.value.gene = res[0].value
  }).finally(() => {
    getRecords()
  })
})
// Refetch the gene list (scoped to the selected source) and the records
// whenever any form field or pagination state changes.
watch(
  () => [formData, pagination],
  (_) => {
    axios.get(APIs.unique, {params: {source: formData.value.source}}).then((response: AxiosResponse) => {
      let res = []
      for (let i of response.data) {
        res.push({"value": i, "label": i})
      }
      genes.value = res
    }).finally(() => {
      getRecords()
    })
  },
  // NOTE(review): getRecords() mutates `pagination` (total/length/page
  // reconciliation), which re-triggers this deep watcher; the assign-only-
  // on-change guards inside getRecords appear to bound the loop — confirm
  // there is no request ping-pong in practice.
  { deep: true },
);
</script>
<template>
<n-grid cols="24" :y-gap="8" item-responsive>
<n-gi span="0 400:1 800:2" responsive="self" />
<n-gi span="24 400:22 600:20" responsive="self">
<n-layout>
<n-layout-header style="min-height: 30px; padding: 10px" bordered>
<n-form label-placement="left">
<n-flex justify="space-around" style="margin-right: 10px">
<!-- 查询界面 -->
<n-grid cols="4" :x-gap="12" :y-gap="8" item-responsive>
<n-gi span="4 400:2 800:1" responsive="self">
<n-form-item label="表">
<n-select v-model:value="formData.source" :options="tables" filterable clearable/>
</n-form-item>
</n-gi>
<n-gi span="4 400:2 800:1" responsive="self">
<n-form-item label="基因">
<n-select v-model:value="formData.gene" :options="genes" filterable clearable/>
</n-form-item>
</n-gi>
<n-gi span="4 400:2 800:1" responsive="self">
<n-form-item label="PBS len <= ">
<n-input-number v-model:value="formData.pbs_len" clearable/>
</n-form-item>
</n-gi>
<n-gi span="4 400:2 800:1" responsive="self">
<n-form-item label="RTT len <= ">
<n-input-number v-model:value="formData.rtt_len" clearable/>
</n-form-item>
</n-gi>
</n-grid>
</n-flex>
</n-form>
</n-layout-header>
<n-layout-content style="padding-top: 10px; padding-left: 5px" bordered>
<n-flex justify="center">
<n-pagination
v-model:page="pagination.page"
:page-sizes="[10, 20, 30, 40]"
:item-count="pagination.total"
v-model:page-size="pagination.length"
show-quick-jumper
show-size-picker
style="padding: 5px"
/>
</n-flex>
<n-data-table
:columns="createColumns()"
:data="data"
:loading="loading"
:scroll-x="1800"
width="100%"
:max-height="600"
:row-key="
(row: RowData) => (row.id)
"
striped
bordered
@update:sorter="processSorter"
sticky-expanded-rows
/>
<n-flex justify="center">
<n-pagination
v-model:page="pagination.page"
:page-sizes="[10, 20, 30, 40]"
:item-count="pagination.total"
v-model:page-size="pagination.length"
show-quick-jumper
show-size-picker
style="padding: 5px"
/>
</n-flex>
</n-layout-content>
</n-layout>
</n-gi>
</n-grid>
</template>
<style scoped>
/* 关键:为表格容器设置固定宽度和溢出控制 */
.table-container {
width: 100%; /* 或者固定宽度,如 1200px */
overflow-x: auto; /* 确保容器可水平滚动 */
}
</style>

View File

@@ -0,0 +1,4 @@
import { createApp } from "vue";
import App from "./App.vue";
createApp(App).mount("#app");

View File

@@ -0,0 +1,79 @@
:root {
font-family: system-ui, Avenir, Helvetica, Arial, sans-serif;
line-height: 1.5;
font-weight: 400;
color-scheme: light dark;
color: rgba(255, 255, 255, 0.87);
background-color: #242424;
font-synthesis: none;
text-rendering: optimizeLegibility;
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
}
a {
font-weight: 500;
color: #646cff;
text-decoration: inherit;
}
a:hover {
color: #535bf2;
}
body {
margin: 0;
display: flex;
place-items: center;
min-width: 320px;
min-height: 100vh;
}
h1 {
font-size: 3.2em;
line-height: 1.1;
}
button {
border-radius: 8px;
border: 1px solid transparent;
padding: 0.6em 1.2em;
font-size: 1em;
font-weight: 500;
font-family: inherit;
background-color: #1a1a1a;
cursor: pointer;
transition: border-color 0.25s;
}
button:hover {
border-color: #646cff;
}
button:focus,
button:focus-visible {
outline: 4px auto -webkit-focus-ring-color;
}
.card {
padding: 2em;
}
#app {
max-width: 1280px;
margin: 0 auto;
padding: 2rem;
text-align: center;
}
@media (prefers-color-scheme: light) {
:root {
color: #213547;
background-color: #ffffff;
}
a:hover {
color: #747bff;
}
button {
background-color: #f9f9f9;
}
}

View File

@@ -0,0 +1,16 @@
{
"extends": "@vue/tsconfig/tsconfig.dom.json",
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
"types": ["vite/client"],
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"erasableSyntaxOnly": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.vue"]
}

View File

@@ -0,0 +1,7 @@
{
"files": [],
"references": [
{ "path": "./tsconfig.app.json" },
{ "path": "./tsconfig.node.json" }
]
}

View File

@@ -0,0 +1,26 @@
{
"compilerOptions": {
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
"target": "ES2023",
"lib": ["ES2023"],
"module": "ESNext",
"types": ["node"],
"skipLibCheck": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"moduleDetection": "force",
"noEmit": true,
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"erasableSyntaxOnly": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedSideEffectImports": true
},
"include": ["vite.config.ts"]
}

View File

@@ -0,0 +1,38 @@
import vue from "@vitejs/plugin-vue";
import AutoImport from "unplugin-auto-import/vite";
import { NaiveUiResolver } from "unplugin-vue-components/resolvers";
import Components from "unplugin-vue-components/vite";
// vite.config.ts
import { defineConfig } from "vite";
// https://vitejs.dev/config/
// https://vitejs.dev/config/
export default defineConfig({
  server: {
    watch: {
      // Use polling instead of native FS events to avoid file-descriptor
      // exhaustion on some systems.
      usePolling: true,
      interval: 1000,
      // Directories that never need watching.
      ignored: ["**/node_modules/**", "**/.git/**", "**/.next/**"],
    },
  },
  plugins: [
    vue(),
    // Auto-import Vue APIs and the listed naive-ui composables so
    // components can use them without explicit import statements.
    AutoImport({
      imports: [
        "vue",
        {
          "naive-ui": [
            "useDialog",
            "useMessage",
            "useNotification",
            "useLoadingBar",
          ],
        },
      ],
    }),
    // Auto-register naive-ui components on demand.
    Components({
      resolvers: [NaiveUiResolver()],
    }),
  ],
});

133
interactive/main.py Normal file
View File

@@ -0,0 +1,133 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from peewee import SQL
from flask import Flask, jsonify, request, abort, send_from_directory
from flask_cors import CORS
from db import insert, index, Pridict2, PrimeDesign, table_columns
app = Flask(__name__, static_folder="./frontend/dist")
CORS(app)
@app.route("/<path:filename>")
def static_files(filename):
"""专门处理带扩展名的文件"""
if "." not in filename:
abort(404) # 无扩展名不应走这里
try:
return send_from_directory(app.static_folder, filename)
except FileNotFoundError:
abort(404) # 静态文件不存在就是 404
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def main(path):
"""仅处理 SPA 路由(无扩展名)"""
if "." in os.path.basename(path):
# 包含扩展名?说明应该是静态文件,但没被上面的路由捕获 → 404
abort(404)
return send_from_directory(app.static_folder, "index.html")
def default_value(val, default):
    """Coerce *val* to int, falling back to *default* on any failure."""
    try:
        result = int(val)
    except Exception:
        result = default
    return result
@app.route("/api/gene")
def gene():
genes = set()
source = request.args.get("source", "pridict2")
tables = {
"pridict2": Pridict2,
"prime_design": PrimeDesign,
}
table = tables.get(source)
if not table:
return jsonify({"message": "No such table"}), 404
for i in table.select(table.gene.distinct()):
genes.add(i.gene)
return jsonify(sorted(genes))
@app.route("/api/records")
def records():
source = request.args.get("source", "pridict2")
tables = {
"pridict2": Pridict2,
"prime_design": PrimeDesign,
}
table = tables.get(source)
if not table:
return jsonify({"message": "No such table"}), 404
columns = table_columns(table)
where = None
for i in ["gene", "dst", "src"]:
value = request.args.get(i)
if value:
if where is None:
where = (SQL(i) == value)
else:
where = (where) & (SQL(i) == value)
for i in ["pbs_len", "rtt_len"]:
value = default_value(request.args.get(i), 0)
if value:
if where is None:
where = (SQL(i) <= value)
else:
where = (where) & (SQL(i) <= value)
query = table.select().where(where)
total = query.count()
order_by = request.args.get("order_by")
if order_by and order_by in columns:
order = request.args.get("order", "asc")
if "desc" in order:
query = query.order_by(SQL(order_by).desc())
else:
query = query.order_by(SQL(order_by))
else:
query = query.order_by(table.gene, table.aa, table.src, table.dst)
offset = default_value(request.args.get("offset"), 1)
if offset <= 0:
offset = 1
length = default_value(request.args.get("length"), 1)
if length > 200:
length = 200
query = query.offset((int(offset) - 1) * length).limit(int(length))
print(query.sql())
return jsonify({
"data": [x for x in query.dicts()],
"total": total,
"offset": offset,
"length": length,
})
def main(host: str="0.0.0.0", port=5555):
app.run(host=host, port=port, threaded=True, debug=True)
if __name__ == "__main__":
from fire import Fire
Fire({
"insert": insert,
"index": index,
"server": main
})

View File

@@ -0,0 +1,9 @@
[project]
name = "interactive"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"peewee>=3.19.0",
]

53
merge_results.py Normal file
View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
import polars as pd
from glob import glob
from tqdm import tqdm
from multiprocessing import Pool
def read_file(args):
    """Load one CSV result file as a DataFrame, or return None to skip it.

    *args* is a ``(path, nicking)`` pair. Empty files and files belonging
    to the excluded genes are skipped. When *nicking* is set, the sequence
    name is recovered from the file name and attached as a column.
    """
    path, nicking = args
    if os.path.getsize(path) < 1:
        return None
    # Compare against the file name, not the full path: the original
    # ``path.startswith(gene)`` never matched because *path* begins with
    # the input directory.
    basename = os.path.basename(path)
    # NOTE(review): "FABCI" looks like a typo for "FANCI" — confirm the
    # intended exclusion list.
    for gene in ["FANCD2", "BRIP1", "RAD51C", "FABCI", "FANCA"]:
        if basename.startswith(gene):
            return None
    try:
        df = pd.read_csv(path)
        if nicking:
            key = basename.split("_nicking")[0]
            df = df.with_columns(sequence_name=pd.lit(key))
    except Exception:
        # Unparseable file: report it and skip.
        print(path)
        return None
    # Tag confidence based on the file-name convention.
    if "low_conf" not in path:
        df = df.with_columns(conf=pd.lit("high"))
    else:
        df = df.with_columns(conf=pd.lit("low"))
    return df
def main(indir, output, nicking=False):
    """Merge every per-gene CSV under *indir* into one gzipped CSV *output*."""
    print(indir, output, nicking)
    fs = glob(os.path.join(indir, "*"))
    # Parse files in parallel; read_file returns None for files to skip.
    with Pool(6) as p:
        dfs = list(tqdm(p.imap(read_file, [[x, nicking] for x in fs]), total=len(fs)))
    df = pd.concat([x for x in dfs if x is not None])
    with gzip.open(output, "w+") as w:
        df.write_csv(w)
if __name__ == '__main__':
from fire import Fire
Fire(main)

113
select_primedesign.py Normal file
View File

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import gzip
import random
import pandas as pd
from tqdm import tqdm
# Fixed seed so the random down-sampling in filter_() is reproducible.
seed = 42
# Standard codon table keyed by amino-acid name. The Chinese keys are
# runtime data (they end up in the output "dst" column) and must stay
# exactly as written; English names are given in the per-line comments.
total_codons = {
    "丙氨酸": ["GCT", "GCC", "GCA", "GCG"],  # alanine
    "精氨酸": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],  # arginine
    "天冬酰胺": ["AAT", "AAC"],  # asparagine
    "天冬氨酸": ["GAT", "GAC"],  # aspartate
    "半胱氨酸": ["TGT", "TGC"],  # cysteine
    "谷氨酰胺": ["CAA", "CAG"],  # glutamine
    "谷氨酸": ["GAA", "GAG"],  # glutamate
    "甘氨酸": ["GGT", "GGC", "GGA", "GGG"],  # glycine
    "组氨酸": ["CAT", "CAC"],  # histidine
    "异亮氨酸": ["ATT", "ATC", "ATA"],  # isoleucine
    "亮氨酸": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],  # leucine
    "赖氨酸": ["AAA", "AAG"],  # lysine
    "甲硫氨酸": ["ATG"],  # methionine
    "苯丙氨酸": ["TTT", "TTC"],  # phenylalanine
    "脯氨酸": ["CCT", "CCC", "CCA", "CCG"],  # proline
    "丝氨酸": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],  # serine
    "苏氨酸": ["ACT", "ACC", "ACA", "ACG"],  # threonine
    "色氨酸": ["TGG"],  # tryptophan
    "酪氨酸": ["TAT", "TAC"],  # tyrosine
    "缬氨酸": ["GTT", "GTC", "GTA", "GTG"],  # valine
    "终止密码子": ["TAA", "TAG", "TGA"]  # stop codons
}
def load_finished(ref):
    """Read Excel sheet *ref* into a gene -> set-of-aa-positions mapping.

    Positions are stored as strings so they compare directly against the
    position substrings parsed out of Target_name later.
    """
    table = pd.read_excel(ref)
    finished = {}
    for _, record in table.iterrows():
        positions = finished.setdefault(record["gene"], set())
        positions.add(str(record["aa_pos"]))
    return finished
def reader(path):
    """Yield rows (as dicts) from the gzipped CSV at *path*, with a progress bar."""
    with gzip.open(path, "rt") as r:
        dict_reader = csv.DictReader(r)
        for row in tqdm(dict_reader):
            yield row
def filter_(args):
    """Select candidate rows and write at most two per edit to *output*.

    *args* is ``(finished, path, output)``:
        finished -- mapping gene -> set of already-finished aa positions
                    (strings), from load_finished().
        path     -- gzipped CSV of prime-design results; Target_name is
                    assumed to look like "<gene>_AA<pos>_..._<codon>"
                    (TODO confirm against the upstream generator).
        output   -- gzipped CSV to write.

    Rows are grouped by (gene, aa position, target codon); groups larger
    than two are down-sampled to two rows with a fixed seed.
    """
    finished, path, output = args
    data = {}
    for row in reader(path):
        # Parse gene / aa position / target codon out of Target_name.
        parts = row["Target_name"].split("_")
        gene = parts[0]
        pos = parts[1].replace("AA", "")
        dst = parts[-1]
        # BUGFIX: the original interpolated the whole `parts` list into the
        # key (f"{key}_..."), so grouping depended on every Target_name
        # component instead of just (gene, pos, dst).
        uid = f"{gene}_{pos}_{dst}"
        for aa_name, codons in total_codons.items():
            if dst not in codons:
                continue
            # Keep only tracked genes, at positions not already finished.
            if gene in finished and pos not in finished[gene]:
                row["dst"] = aa_name
                row["aa_pos"] = pos
                row["gene"] = gene
                data.setdefault(uid, []).append(row)
    dict_writer = None
    with gzip.open(output, "wt+") as w:
        for lines in tqdm(data.values()):
            if len(lines) > 2:
                # Re-seed each time so sampling is deterministic per group.
                random.seed(seed)
                lines = random.sample(lines, 2)
            if dict_writer is None:
                # Initialize the writer from the first emitted row's fields.
                dict_writer = csv.DictWriter(w, fieldnames=lines[0].keys())
                dict_writer.writeheader()
            dict_writer.writerows(lines)
def main(ref, indir, outdir):
    """Entry point: load finished positions from *ref*, then filter *indir* into *outdir*."""
    filter_((load_finished(ref), indir, outdir))
if __name__ == '__main__':
from fire import Fire
Fire(main)