提交饱和编辑的相关设计,及检验代码
This commit is contained in:
301
.gitignore
vendored
Normal file
301
.gitignore
vendored
Normal file
@@ -0,0 +1,301 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
logs
|
||||||
|
*.log
|
||||||
|
npm-debug.log*
|
||||||
|
yarn-debug.log*
|
||||||
|
yarn-error.log*
|
||||||
|
lerna-debug.log*
|
||||||
|
.pnpm-debug.log*
|
||||||
|
|
||||||
|
# Diagnostic reports (https://nodejs.org/api/report.html)
|
||||||
|
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
|
||||||
|
|
||||||
|
# Runtime data
|
||||||
|
pids
|
||||||
|
*.pid
|
||||||
|
*.seed
|
||||||
|
*.pid.lock
|
||||||
|
|
||||||
|
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||||
|
lib-cov
|
||||||
|
|
||||||
|
# Coverage directory used by tools like istanbul
|
||||||
|
coverage
|
||||||
|
*.lcov
|
||||||
|
|
||||||
|
# nyc test coverage
|
||||||
|
.nyc_output
|
||||||
|
|
||||||
|
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
|
||||||
|
.grunt
|
||||||
|
|
||||||
|
# Bower dependency directory (https://bower.io/)
|
||||||
|
bower_components
|
||||||
|
|
||||||
|
# node-waf configuration
|
||||||
|
.lock-wscript
|
||||||
|
|
||||||
|
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||||
|
build/Release
|
||||||
|
|
||||||
|
# Dependency directories
|
||||||
|
node_modules/
|
||||||
|
jspm_packages/
|
||||||
|
|
||||||
|
# Snowpack dependency directory (https://snowpack.dev/)
|
||||||
|
web_modules/
|
||||||
|
|
||||||
|
# TypeScript cache
|
||||||
|
*.tsbuildinfo
|
||||||
|
|
||||||
|
# Optional npm cache directory
|
||||||
|
.npm
|
||||||
|
|
||||||
|
# Optional eslint cache
|
||||||
|
.eslintcache
|
||||||
|
|
||||||
|
# Optional stylelint cache
|
||||||
|
.stylelintcache
|
||||||
|
|
||||||
|
# Microbundle cache
|
||||||
|
.rpt2_cache/
|
||||||
|
.rts2_cache_cjs/
|
||||||
|
.rts2_cache_es/
|
||||||
|
.rts2_cache_umd/
|
||||||
|
|
||||||
|
# Optional REPL history
|
||||||
|
.node_repl_history
|
||||||
|
|
||||||
|
# Output of 'npm pack'
|
||||||
|
*.tgz
|
||||||
|
|
||||||
|
# Yarn Integrity file
|
||||||
|
.yarn-integrity
|
||||||
|
|
||||||
|
# dotenv environment variable files
|
||||||
|
.env
|
||||||
|
.env.development.local
|
||||||
|
.env.test.local
|
||||||
|
.env.production.local
|
||||||
|
.env.local
|
||||||
|
|
||||||
|
# parcel-bundler cache (https://parceljs.org/)
|
||||||
|
.cache
|
||||||
|
.parcel-cache
|
||||||
|
|
||||||
|
# Next.js build output
|
||||||
|
.next
|
||||||
|
out
|
||||||
|
|
||||||
|
# Nuxt.js build / generate output
|
||||||
|
.nuxt
|
||||||
|
dist
|
||||||
|
|
||||||
|
# Gatsby files
|
||||||
|
.cache/
|
||||||
|
# Comment in the public line in if your project uses Gatsby and not Next.js
|
||||||
|
# https://nextjs.org/blog/next-9-1#public-directory-support
|
||||||
|
# public
|
||||||
|
|
||||||
|
# vuepress build output
|
||||||
|
.vuepress/dist
|
||||||
|
|
||||||
|
# vuepress v2.x temp and cache directory
|
||||||
|
.temp
|
||||||
|
.cache
|
||||||
|
|
||||||
|
# Docusaurus cache and generated files
|
||||||
|
.docusaurus
|
||||||
|
|
||||||
|
# Serverless directories
|
||||||
|
.serverless/
|
||||||
|
|
||||||
|
# FuseBox cache
|
||||||
|
.fusebox/
|
||||||
|
|
||||||
|
# DynamoDB Local files
|
||||||
|
.dynamodb/
|
||||||
|
|
||||||
|
# TernJS port file
|
||||||
|
.tern-port
|
||||||
|
|
||||||
|
# Stores VSCode versions used for testing VSCode extensions
|
||||||
|
.vscode-test
|
||||||
|
|
||||||
|
# yarn v2
|
||||||
|
.yarn/cache
|
||||||
|
.yarn/unplugged
|
||||||
|
.yarn/build-state.yml
|
||||||
|
.yarn/install-state.gz
|
||||||
|
.pnp.*
|
||||||
|
Footer
|
||||||
|
© 2022 GitHub, Inc.
|
||||||
|
Footer navigation
|
||||||
|
Terms
|
||||||
|
|
||||||
|
|
||||||
|
.idea/
|
||||||
|
*.png
|
||||||
|
*.pdf
|
||||||
|
|
||||||
|
Mus_musculus.GRCm38.101.gtf.gz*
|
||||||
|
hg38.chr19.gtf*
|
||||||
|
Homo_sapiens.GRCh38.101.sorted.gtf.gz*
|
||||||
|
SRX8994511.corrected_reads.bed.gz*
|
||||||
|
SRX8994511_sample.bed.gz*
|
||||||
|
tmp
|
||||||
|
new_run.sh
|
||||||
|
example.sorted.sorted.gtf
|
||||||
|
example.sorted.sorted.gtf.gz
|
||||||
|
example.sorted.sorted.gtf.gz.tbi
|
||||||
|
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
docs/_*
|
||||||
|
plots/
|
||||||
|
conda_build.py
|
||||||
|
run.sh
|
||||||
|
param.py
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
ui/
|
||||||
|
*.rds
|
||||||
|
*.zip
|
||||||
|
example/
|
||||||
|
recipes/
|
||||||
|
AppDir/
|
||||||
|
appimage-build/
|
||||||
|
*_issue
|
||||||
|
.ruff_cache
|
||||||
|
*.csv
|
||||||
89
better_input_seq.py
Normal file
89
better_input_seq.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import gzip
|
||||||
|
from glob import glob
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def load_left_aa(ref):
    """Read a CSV of remaining targets and return a set of "<gene>_AA<pos>" keys.

    Args:
        ref: path or file-like object readable by pandas, with `gene` and
            `aa_pos` columns.

    Returns:
        Set of strings of the form "GENE_AA<pos>".
    """
    table = pd.read_csv(ref)

    # Build the lookup keys directly from the two relevant columns.
    return {
        f"{record['gene']}_AA{record['aa_pos']}"
        for _, record in table.iterrows()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def reader(path):
    """Stream rows from a gzip-compressed CSV as dicts, with a tqdm progress bar.

    Args:
        path: path of the gzip-compressed CSV file.

    Yields:
        One dict per data row, keyed by the header columns.
    """
    with gzip.open(path, "rt") as handle:
        # DictReader maps each data line onto the header row's column names.
        yield from tqdm(csv.DictReader(handle))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def process_seq(sequence: str):
    """Collapse a "<flank5>(<src>/<dst>)<flank3>" edit string to the minimal
    differing positions of the 3-bp codon.

    Codon positions where src and dst agree are moved out into the flanking
    sequence. Returns None when no trimming rule applies (all three bases
    differ, none differ, or two differ with only the middle base shared).
    """
    left, codon = sequence.split("(")
    codon, right = codon.split(")")
    src, dst = codon.split("/")

    n_diff = sum(a != b for a, b in zip(src, dst))

    if n_diff == 1:
        if src[:2] == dst[:2]:
            # Mismatch at the last base: push the first two into the 5' flank.
            return f"{left}{src[:2]}({src[-1]}/{dst[-1]}){right}"
        if src[1:] == dst[1:]:
            # Mismatch at the first base: push the last two into the 3' flank.
            return f"{left}({src[0]}/{dst[0]}){src[1:]}{right}"
        # Mismatch at the middle base only: push one base to each side.
        return f"{left}{src[0]}({src[1]}/{dst[1]}){src[-1]}{right}"

    if n_diff == 2:
        if src[0] == dst[0]:
            # Shared first base moves into the 5' flank.
            return f"{left}{src[0]}({src[1:]}/{dst[1:]}){right}"
        if src[-1] == dst[-1]:
            # Shared last base moves into the 3' flank.
            return f"{left}({src[:2]}/{dst[:2]}){src[-1]}{right}"

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def main(ref, infile, outfile):
    """Filter edit-sequence CSVs down to the targets listed in *ref* and
    rewrite them (minimal codon form) into one gzip-compressed CSV.

    Args:
        ref: CSV with `gene` / `aa_pos` columns naming the kept targets.
        infile: glob pattern matching gzip-compressed input CSVs.
        outfile: path of the gzip-compressed output CSV.
    """
    keep = load_left_aa(ref)

    data = []
    for file in glob(infile):
        for row in reader(file):
            # Target key is the first two "_"-separated fields of the
            # sequence name, e.g. "GENE_AA12".
            seq_name = "_".join(row["sequence_name"].split("_")[:2])
            if seq_name not in keep:
                continue

            row["editseq"] = process_seq(row["editseq"])

            # process_seq returns None when no trimming rule applied.
            if row["editseq"]:
                # Drop columns that are meaningless after rewriting;
                # tolerate inputs that never had them.
                row.pop("strategy", None)
                row.pop("mutation_type", None)
                data.append(row)

    if not data:
        # Nothing matched: previously this crashed on data[0] (IndexError).
        return

    with gzip.open(outfile, "wt") as w:
        dict_writer = csv.DictWriter(w, fieldnames=data[0].keys())
        dict_writer.writeheader()
        # Write the data rows.
        dict_writer.writerows(data)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Expose the CLI via python-fire: positional/keyword args map onto
    # main(ref, infile, outfile).
    from fire import Fire

    Fire(main)
|
||||||
222
combine_pridict2_primedesign.py
Normal file
222
combine_pridict2_primedesign.py
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
import csv
|
||||||
|
import gzip
|
||||||
|
import logging
|
||||||
|
from typing import Set, Dict, Iterator
|
||||||
|
|
||||||
|
# Configure logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def read_excluded_sequences(file_path: str, sequence_column: str = 'sequence_name') -> Set[str]:
    """Collect the sequence names to exclude from a gzip-compressed CSV.

    Args:
        file_path: path of the gzip-compressed CSV file.
        sequence_column: column holding the sequence names.

    Returns:
        Set of (stripped) sequence names.

    Raises:
        ValueError: when the required column is missing.
        FileNotFoundError: when the file does not exist.
    """
    names: Set[str] = set()
    try:
        with gzip.open(file_path, 'rt', newline='', encoding='utf-8') as handle:
            rows = csv.DictReader(handle)

            # Fail fast when the required column is absent.
            if sequence_column not in rows.fieldnames:
                raise ValueError(f"CSV文件中缺少'{sequence_column}'列")

            for record in rows:
                value = record.get(sequence_column)
                if value:
                    names.add(value.strip())

        logger.info(f"从 {file_path} 读取了 {len(names)} 个排除序列")
        return names

    except FileNotFoundError:
        logger.error(f"文件不存在: {file_path}")
        raise
    except Exception as err:
        logger.error(f"读取文件 {file_path} 时出错: {err}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def validate_csv_headers(file_path: str, expected_headers: Set[str], gzipped: bool = True) -> bool:
    """Check that a CSV file's header row contains every required column.

    Args:
        file_path: path of the file to inspect.
        expected_headers: columns that must all be present.
        gzipped: whether the file is gzip-compressed.

    Returns:
        True when the header row exists and covers expected_headers;
        False otherwise (including on any read error).
    """
    try:
        # Pick the matching opener/mode for gzip vs plain text input.
        opener, mode = (gzip.open, 'rt') if gzipped else (open, 'r')

        with opener(file_path, mode, newline='', encoding='utf-8') as handle:
            # Only the first row (the header) is needed.
            header_row = next(csv.reader(handle), None)

            if not header_row:
                logger.error(f"文件 {file_path} 没有表头或为空")
                return False

            missing_headers = expected_headers - set(header_row)
            if missing_headers:
                logger.error(f"文件 {file_path} 缺少必需列: {missing_headers}")
                return False

            logger.info(f"文件 {file_path} 表头验证通过")
            return True

    except Exception as err:
        logger.error(f"验证文件 {file_path} 表头时出错: {err}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def process_prime_design(primedesign_path: str, excluded_sequences: Set[str],
                         output_path: str, batch_size: int = 10000) -> int:
    """
    Stream the PrimeDesign output and keep only pegRNA rows whose target
    name is not in the exclusion set.

    Args:
        primedesign_path: PrimeDesign output file path (gzip-compressed CSV)
        excluded_sequences: set of Target_name values to drop
        output_path: destination path (gzip-compressed CSV)
        batch_size: flush the output buffer every this many written rows

    Returns:
        Number of rows written to the output file.

    Raises:
        Re-raises any error opening/reading the files; per-row errors are
        logged and skipped instead.
    """
    processed_count = 0
    written_count = 0

    try:
        with gzip.open(primedesign_path, 'rt', newline='', encoding='utf-8') as input_file, \
                gzip.open(output_path, 'wt', newline='', encoding='utf-8') as output_file:

            # CSV reader/writer pair; output keeps the input's column layout.
            reader = csv.DictReader(input_file)
            writer = csv.DictWriter(output_file, fieldnames=reader.fieldnames)

            # Write the header row first.
            writer.writeheader()

            # Process the data row by row (constant memory).
            for row in reader:
                processed_count += 1

                try:
                    gRNA_type = row.get('gRNA_type', '').strip()
                    target_name = row.get('Target_name', '').strip()

                    # Keep only pegRNA rows that are not excluded.
                    if gRNA_type == "pegRNA" and target_name not in excluded_sequences:
                        writer.writerow(row)
                        written_count += 1

                        # Flush the buffer periodically.
                        if written_count % batch_size == 0:
                            output_file.flush()

                except KeyError as e:
                    logger.warning(f"第 {processed_count} 行缺少字段 {e},跳过该行")
                    continue
                except Exception as e:
                    logger.warning(f"处理第 {processed_count} 行时出错: {e},跳过该行")
                    continue

            # Final flush before the files close.
            output_file.flush()

        logger.info(f"处理完成: 处理了 {processed_count} 行,写入了 {written_count} 行")
        return written_count

    except Exception as e:
        logger.error(f"处理PrimeDesign文件时出错: {e}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def main(pegrna: str, primedesign: str, output: str) -> None:
    """
    Pipeline entry point: validate both inputs, load the exclusion list,
    then filter the PrimeDesign output down to non-excluded pegRNA rows.

    Args:
        pegrna: gzip-compressed CSV listing the sequences to exclude
        primedesign: PrimeDesign output file (gzip-compressed CSV)
        output: output file prefix (suffix is appended below)

    Raises:
        ValueError: when either input file fails header validation.
    """
    logger.info("开始处理PrimeDesign文件")

    # Step 1: validate input file formats.
    logger.info("验证输入文件格式...")

    # The exclusion list must expose a 'sequence_name' column.
    if not validate_csv_headers(pegrna, {'sequence_name'}, gzipped=True):
        raise ValueError("pegrna文件格式验证失败")

    # The PrimeDesign output must expose gRNA_type / Target_name columns.
    if not validate_csv_headers(primedesign, {'gRNA_type', 'Target_name'}, gzipped=True):
        raise ValueError("primedesign文件格式验证失败")

    # Step 2: read the sequences to exclude.
    logger.info("读取需要排除的序列...")
    excluded_sequences = read_excluded_sequences(pegrna)

    # Step 3: filter the PrimeDesign file.
    logger.info("开始处理PrimeDesign文件...")
    output_path = f"{output}_PrimeDesign_pegRNA.csv.gz"

    written_count = process_prime_design(
        primedesign_path=primedesign,
        excluded_sequences=excluded_sequences,
        output_path=output_path,
        batch_size=10000
    )

    logger.info(f"输出文件已保存: {output_path}")
    logger.info(f"总共写入了 {written_count} 条pegRNA记录")
|
||||||
|
|
||||||
|
|
||||||
|
def safe_main(pegrna: str, primedesign: str, output: str) -> None:
    """Wrapper around main() that logs success or failure.

    Args:
        pegrna: gzip-compressed CSV listing the sequences to exclude
        primedesign: PrimeDesign output file (gzip-compressed CSV)
        output: output file prefix

    Raises:
        Re-raises whatever main() raised, after logging it.
    """
    try:
        main(pegrna, primedesign, output)
        logger.info("程序执行成功!")
    except Exception as err:
        logger.error(f"程序执行失败: {err}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point when run directly.
if __name__ == "__main__":
    import sys

    # argv: <pegrna_csv_gz> <primedesign_csv_gz> <output_prefix>
    safe_main(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||||
450
design/main.py
Normal file
450
design/main.py
Normal file
@@ -0,0 +1,450 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from glob import glob
|
||||||
|
import requests as rq
|
||||||
|
import pandas as pd
|
||||||
|
from loguru import logger
|
||||||
|
import numpy as np
|
||||||
|
from src.mutation import design_mutations_for_orf
|
||||||
|
from src.reader import (extract_orf_sequence, get_cds_for_gene,
|
||||||
|
load_uniprot_region, read_gtf, Region)
|
||||||
|
from src.liftover import convert_interval
|
||||||
|
from src.snp import decode_snp, generate_sequences_with_combinations
|
||||||
|
import itertools
|
||||||
|
from src.editseq import run_analysis
|
||||||
|
|
||||||
|
|
||||||
|
# Drop loguru's default handler so this module fully controls the sinks.
logger.remove()

# Add a stderr sink that only emits INFO-and-above records.
# logger.add(level="INFO")
logger.add(
    sys.stderr,
    colorize=True,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
    # <cyan>{name}</cyan>: <cyan>{function}</cyan>: <cyan>{line}</cyan>
    level="INFO"
)
|
||||||
|
|
||||||
|
|
||||||
|
def split_regions(cds):
    u"""Split the original CDS intervals into 3-bp amino-acid regions.

    Each emitted Region is tagged via `kind`: "regular" for a codon fully
    inside one CDS piece, "start"/"end" when it sits on (or crosses) a CDS
    boundary. The source CDS interval is attached as `.addition`.

    Test case (one gene's CDS intervals):
    14:103698801-103699017
    14:103699133-103699179
    14:103699364-103699576
    14:103703173-103703327
    14:103707003-103707215
    14:103708522-103708659
    14:103711033-103711087
    """
    regions = []
    cds = sorted(cds, key=lambda x: (x.chrom, x.start, x.end))
    aa_codon_len = 3
    # `start` doubles as a carry: 0 = start fresh, <0 = leftover bases from
    # the previous CDS piece, >0 = current genomic cursor.
    start = 0

    for x in cds:
        # start == 0: begin cleanly at this CDS piece.
        if start == 0:
            start = x.start
        elif start < 0:
            # A negative carry means the previous CDS did not divide evenly
            # into codons, so the leftover prefix of this CDS is emitted as
            # its own small "start" region.
            regions.append(Region(x.chrom, x.start, x.start - start, kind="start"))
            regions[-1].addition = x
            start = x.start - start

        while start + aa_codon_len <= x.end:
            # Tag whether this codon touches a CDS boundary, and which one.
            code = "regular"
            if start == x.start:
                code = "start"
            elif start + aa_codon_len == x.end:
                code = "end"

            regions.append(Region(x.chrom, start, start + aa_codon_len, kind=code))
            regions[-1].addition = x
            start += aa_codon_len

        if start < x.end:
            # The next codon would run past this CDS end: emit the partial
            # piece and carry a negative remainder into the next CDS.
            regions.append(Region(x.chrom, start, x.end, kind="end"))
            regions[-1].addition = x
            # NOTE(review): carry is start - x.end + 1; the +1 presumably
            # reflects the coordinate convention — confirm.
            start = start - x.end + 1
        else:
            # Exact fit: reset so the next CDS starts cleanly.
            start = 0

    return regions
|
||||||
|
|
||||||
|
|
||||||
|
def download_uniprot_region(protein, output):
    """Download UniProt genomic coordinates for *protein* and write a TSV
    pairing each exon's genomic span with its protein (amino-acid) span.

    A JSON copy of the API response is cached next to *output* (same name,
    ``.json`` suffix) and reused on subsequent calls.

    Args:
        protein: UniProt accession.
        output: destination ``.tsv`` path.

    Raises:
        ValueError: when the accession does not resolve to a human entry.
    """
    cache = output.replace(".tsv", ".json")
    url = f"https://www.ebi.ac.uk/proteins/api/coordinates?accession={protein}"

    if os.path.exists(cache):
        with open(cache, "r") as r:
            resp = json.load(r)
    else:
        resp = rq.get(url, headers={"Accept": "application/json"}).json()
        # Persist the raw response so reruns skip the network call.
        with open(cache, "w+") as w:
            json.dump(resp, w, indent=4)

    if not resp[0]["name"].endswith("HUMAN"):
        raise ValueError("protein is not human")

    # Canonical chromosomes only, with and without the "chr" prefix.
    __chroms__ = [str(x) for x in range(1, 23)] + ["chr" + str(x) for x in range(1, 23)] + ["X", "Y", "chrX", "chrY"]

    with open(output, "w+") as w:
        # First line records the source URL as a comment.
        w.write(f"#{url}\n")
        for coord in resp[0]["gnCoordinate"]:
            chromosome = coord["genomicLocation"]["chromosome"]

            # Skip patches/scaffolds mapped to non-canonical chromosomes.
            if chromosome not in __chroms__:
                continue

            for row in coord["genomicLocation"]["exon"]:
                genome = row["genomeLocation"]
                genome = str(genome["begin"]["position"]) + "-" + str(genome["end"]["position"])

                # Single-residue exons carry "position" instead of begin/end.
                protein_loc = row["proteinLocation"]
                if "end" not in protein_loc and "position" in protein_loc:
                    protein_loc = [str(protein_loc["position"]["position"]), "-", str(protein_loc["position"]["position"])]
                else:
                    protein_loc = [str(protein_loc["begin"]["position"]), "-", str(protein_loc["end"]["position"])]

                # Backslashes are not allowed inside f-string expressions
                # before Python 3.12, so build the tab-join outside the
                # f-string.
                w.write(f"{chromosome}:{genome}\t" + "\t".join(protein_loc) + "\n")

            # Only the first coordinate set on a canonical chromosome is used.
            break
|
||||||
|
|
||||||
|
|
||||||
|
def get_aa_coords(genes, output):
    """For each gene listed in the Excel sheet, find its reviewed human
    UniProt accession and download its exon/amino-acid coordinate TSV
    into the *output* directory.
    """
    os.makedirs(output, exist_ok=True)
    df = pd.read_excel(genes)
    # df = df.loc[df["Batch"] == 1, :]

    for _, row in df.iterrows():
        # NOTE(review): the gene symbol is taken from the second column by
        # position — confirm against the spreadsheet layout.
        gene_name = row[1]

        # Query UniProt for the reviewed human entry of this gene.
        url = f"https://rest.uniprot.org/uniprotkb/search?query=gene:{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json"
        resp = rq.get(url)

        for row in resp.json().get("results", []):
            # Use the first hit whose entry id marks it as human.
            if "HUMAN" in row["uniProtkbId"]:
                priority = row["primaryAccession"]
                download_uniprot_region(priority, os.path.join(output, f"{gene_name}_{priority}.tsv"))
                break
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def adjust_cross_border_region(row):
    """Widen a partial (<3 bp) boundary-crossing region to a full 3-bp codon
    and format it as "chrom:start-end"; all other regions fall back to
    str(row)."""
    if "cross" not in row.kind or len(row) >= 3:
        return str(row)

    # Extend toward the side that was cut off by the CDS border.
    if row.kind == "cross_start":
        lo, hi = row.end - 3, row.end
    else:
        lo, hi = row.start, row.start + 3

    return f"{row.chrom}:{lo}-{hi}"
|
||||||
|
|
||||||
|
|
||||||
|
def design_by_aa(genes, fasta, output, stop_codon = False):
    u"""Design mismatch sites and mismatch rules per amino acid.

    genes: directory of per-gene UniProt coordinate TSVs.
    fasta: reference genome FASTA (bgzipped).
    output: destination CSV path.
    stop_codon: when True, keep only variants that create a stop codon.
    """
    df = []
    for gene in glob(os.path.join(genes, "*.tsv")):
        logger.info(f"开始设计突变 {gene}...")
        key = os.path.basename(gene).split(".")[0]

        # Load the recorded CDS regions for this gene.
        cds = load_uniprot_region(gene)

        # Split them into per-amino-acid (3 bp) regions.
        cds = split_regions(cds)

        if not cds:
            continue

        # Attach the genomic sequence of each region.
        cds = extract_orf_sequence(fasta, cds, half_open=True)

        for idx, x in enumerate(cds):
            for strategy in ["3N"]:
                results = design_mutations_for_orf(x.sequence, strategy=strategy)
                for res in results:
                    for var in res["variants"]:
                        # Skip the identity "variant".
                        if var == res["original_codon"]:
                            continue

                        # 1 bp at the region start → the other 2 bp sit in the
                        # preceding exon: the first two bases must match the
                        # recorded sequence.
                        if "cross_start" == x.kind and len(x) == 1 and var[:2] != x.sequence[:2]:
                            continue

                        # 2 bp at the start → 1 bp precedes: the first base
                        # must match the recorded sequence.
                        elif "cross_start" == x.kind and len(x) == 2 and var[0] != x.sequence[0]:
                            continue

                        # 1 bp at the end → 2 bp follow: the last two bases
                        # must match the recorded sequence.
                        elif "cross_end" == x.kind and len(x) == 1 and var[1:] != x.sequence[1:]:
                            continue

                        # 2 bp at the end → 1 bp follows: that base must be
                        # C or G.
                        elif "cross_end" == x.kind and len(x) == 2 and var[-1] not in ["C", "G"]:
                            continue

                        row = [key, str(x.addition), idx+1, str(x), adjust_cross_border_region(x), x.kind, strategy, res["original_codon"], var]
                        df.append(row)

    df = pd.DataFrame(df)
    # NOTE(review): "origial_code" looks like a typo for "original_code";
    # kept as-is since downstream consumers may rely on this column name.
    df.columns = ["gene", "cds_region", "aa_index", "aa_region", "region_with_intron", "cross_cds_border", "strategy",
                  "origial_code", "mutation_code"]
    # Recompute the strategy label from the actual number of mismatched bases.
    strategy = []
    for _, row in df.iterrows():
        match = np.sum([x == y for x, y in zip(row["origial_code"], row["mutation_code"])])
        strategy.append(f"{3-match}N")

    df["strategy"] = strategy

    # Optionally keep only variants introducing a stop codon.
    if stop_codon:
        df = df[df["mutation_code"].isin(["TAA", "TAG", "TGA"])]

    df.to_csv(output, index = False)
|
||||||
|
|
||||||
|
|
||||||
|
def design_by_snp(snp_info, targets, genes, fasta, fasta_hg38, output):
    """Design edits from curated SNP tables.

    snp_info: Excel workbook mapping cDNA changes to genomic changes, one
        sheet per gene.
    targets: Excel workbook (third sheet) listing the variant codes per gene
        column.
    genes: directory of per-gene UniProt coordinate TSVs.
    fasta / fasta_hg38: the two reference genome FASTAs used below.
    output: destination CSV path.
    """
    logger.info("读取染色体")
    # Per-gene chromosome and first CDS start, from the sorted intervals.
    chroms = {}
    starts = {}
    for gene in glob(os.path.join(genes, "*.tsv")):
        key = os.path.basename(gene).split(".")[0]
        cds = load_uniprot_region(gene)
        cds = sorted(cds, key=lambda x:[x.chrom, x.start, x.end])
        chroms[key] = cds[0].chrom
        starts[key] = cds[0].start

    logger.info(f"读取snp的信息:{snp_info}")
    all_sheets = pd.read_excel(snp_info, sheet_name=None)

    # Walk every sheet, building cDNA change -> genomic change lookups.
    res = {}
    for sheet_name, df in all_sheets.items():
        temp = {}
        for _, row in df.iterrows():
            # NOTE(review): trailing spaces in these column names match the
            # spreadsheet headers — do not strip them.
            cdna = row["DNA change (cDNA) "]
            hg38 = row["DNA change (genomic) (hg19) "]
            temp[cdna] = hg38
        # Register the same table under every token of the sheet name,
        # splitting on parentheses/whitespace.
        for sheet in re.split(r"[\((\s\))]", sheet_name):
            res[sheet] = temp

    print(res.keys())

    logger.info(f"读取目标:{targets}")
    df = pd.read_excel(targets, sheet_name=2)

    with open(output, "w+") as w:
        w.write(",".join(["gene", "cdna code", "genomic code", "mutation_region", "version", "original_codon", "mutation_code"]) + "\n")
        for column in df.columns:
            # Skip pandas' placeholder columns for unnamed Excel headers.
            if "Unnamed" in column:
                continue

            for code in df[column]:
                # Skip empty Excel cells (NaN floats).
                if not isinstance(code, str) and math.isnan(code):
                    continue

                genomic_code = res.get(column, {}).get(code)

                if genomic_code:
                    sites, rule = decode_snp(genomic_code)
                elif str(code).startswith("c."):
                    # cDNA-relative coordinates: anchor on the gene's CDS
                    # start. "FAND2" is mapped to FANCD2 (sheet typo).
                    sites, rule = decode_snp(code, ref_start=starts["FANCD2" if column == "FAND2" else column])
                else:
                    continue

                region = Region(chroms["FANCD2" if column == "FAND2" else column], start=sites[0], end=sites[-1])

                # Genomic codes use `fasta`; cDNA-derived ones use the hg38
                # FASTA and are flagged accordingly in the output.
                hg38 = False
                if genomic_code:
                    region = extract_orf_sequence(fasta, [region])[0]
                elif str(code).startswith("c."):
                    hg38 = True
                    region = extract_orf_sequence(fasta_hg38, [region])[0]

                # Translate the HGVS-style rule into original/replacement
                # sequences (same scheme as decode_mutation in this module).
                original, replacement = "", ""
                if ">" in rule:
                    original, replacement = rule.split(">")
                    original = region.sequence
                elif rule == "dup":
                    original = region.sequence
                    replacement = original * 2
                elif rule == "del":
                    original = region.sequence
                    replacement = ""
                elif rule == "ins":
                    replacement = region.sequence
                elif "delins" in rule:
                    original = region.sequence
                    replacement = rule.replace("delins", "")
                elif "ins" in rule:
                    original = region.sequence
                    replacement = rule.replace("ins", "")

                if not genomic_code:
                    genomic_code = ""

                # Expand every ambiguous N in both sequences into all
                # concrete base combinations.
                for o, r in itertools.product(generate_sequences_with_combinations(original), generate_sequences_with_combinations(replacement)):
                    w.write(",".join([column, code.strip(), str(genomic_code).strip(), str(region), "hg38" if hg38 else "hg19", o, r]) + "\n")

    # data = pd.DataFrame(data)
    # data.columns = ["gene", "cdna code", "genomic code", "mutation_region", "original_codon", "mutation_code"]
    # data.to_csv(output, index = False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_fastq_seq(fastq: str, chrom, start, end):
    """Fetch the [start, end) subsequence of *chrom* from an indexed FASTA.

    Args:
        fastq: path of the (faidx-indexed) FASTA file.
        chrom: chromosome name; coerced to str for the fetch call.
        start, end: genomic interval passed straight to pysam.
    """
    import pysam

    with pysam.FastaFile(fastq) as fasta:
        return fasta.fetch(str(chrom), start, end)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_mutation(rule: str, sequence):
    """Translate an HGVS-style mutation *rule* plus its reference *sequence*
    into an (original, replacement) pair; unrecognized rules give ("", "").
    """
    if ">" in rule:
        # Substitution: the reference is the extracted sequence; the
        # replacement is whatever follows the ">".
        _, replacement = rule.split(">")
        return sequence, replacement
    if rule == "dup":
        return sequence, sequence * 2
    if rule == "del":
        return sequence, ""
    if rule == "ins":
        # Bare insertion: the extracted sequence IS the inserted bases.
        return "", sequence
    if "delins" in rule:
        # Deletion-insertion: the inserted bases follow the "delins" tag.
        return sequence, rule.replace("delins", "")
    if "ins" in rule:
        return sequence, rule.replace("ins", "")
    return "", ""
|
||||||
|
|
||||||
|
|
||||||
|
def design_by_hmgd(data, fasta, outfile):
    """Build editseq records ("<flank>(<ref>/<alt>)<flank>") from an HGMD
    variant table and write them to *outfile* as CSV.
    """
    import re
    res = pd.read_csv(data)
    # print(res.head())

    # Columns consumed below:
    # hgvs
    # chromosome
    # startCoord
    # endCoord

    data = []
    for idx, row in res.iterrows():

        key = row["gene"] + "_" + str(idx)

        try:
            # Reference bases of the variant itself.
            # NOTE(review): startCoord - 1 suggests 1-based inclusive input
            # coordinates vs pysam's 0-based half-open fetch — confirm.
            seq = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1,row["endCoord"])

            seq, replace = decode_mutation(row["hgvs"], seq)

            # NOTE(review): pure insertions (empty `seq`) are skipped here —
            # confirm that is intended.
            if not seq:
                continue
            # Strip positional digits/underscores left over from the HGVS
            # text in the replacement.
            replace = re.sub(r"[\d_]", "", replace)

            if "del" in replace:
                replace = ""

            print(key, seq, replace)

            # ~100 bp of flanking genomic context on each side.
            before = extract_fastq_seq(fasta, int(row["chromosome"]), row["startCoord"] - 1 - 100, row["startCoord"])
            after = extract_fastq_seq(fasta, int(row["chromosome"]), row["endCoord"], row["endCoord"] + 100)


            seq = f"{before}({seq}/{replace}){after}"
            data.append({"sequence_name": key, "editseq": seq})
        except Exception:
            # Best-effort: malformed rows / fetch failures are skipped.
            continue

    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    from fire import Fire

    # NOTE(review): ``Fire`` is imported but never invoked -- every example
    # below is commented out, so running this script currently does nothing.
    # TODO: wire up a ``Fire({...})`` dispatch or drop the import.

    # get_aa_coords(
    #     "../metainfo/Cancer and blood disorder panels_v2.xlsx",
    #     "../gene_coords/batch2"
    # )

    # get_aa_coords(
    #     "../metainfo/DDR gene library in 2021 Cell.xlsx",
    #     "../gene_coords/positive"
    # )

    # # Fire({"aa": design_by_aa})
    # design_by_aa(
    #     "../gene_coords/batch2",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_batch2.csv.gz"
    # )

    # design_by_aa(
    #     "../gene_coords/positive",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     output="../gene_aa_target_positive.csv.gz",
    #     stop_codon = True
    # )

    # run_analysis(
    #     "../gene_aa_target_batch2.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/batch2"
    # )

    # run_analysis(
    #     "../gene_aa_target_positive.csv.gz",
    #     reference="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outdir="../../prediction/input/positive"
    # )

    # Build the SNP design: snp_info is the curated SNP table,
    # targets lists the genes that need processing.
    # design_by_snp(
    #     snp_info="../metainfo/副本FA家族基因-20250829-DJJ_XD.xlsx",
    #     targets="../metainfo/实验计划.xlsx",
    #     output="gene_snp_target.csv",
    #     fasta="../ref/gencode/GRCh37.p13.genome.fa.gz",
    #     fasta_hg38="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     genes="../gene_coords"
    # )

    # design_by_hmgd(
    #     "../metainfo/allmut.csv",
    #     fasta="../ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    #     outfile="../../prediction/input/pos_v2.csv.gz"
    # )

    # url = "https://www.ebi.ac.uk/proteins/api/coordinates?accession=P21359-1"
    # download_uniprot_region("Test", "P21359")
|
||||||
|
|
||||||
|
|
||||||
16
design/pyproject.toml
Normal file
16
design/pyproject.toml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[project]
|
||||||
|
name = "pgrna"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Add your description here"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"biopython>=1.85",
|
||||||
|
"fire>=0.7.1",
|
||||||
|
"loguru>=0.7.3",
|
||||||
|
"openpyxl>=3.1.5",
|
||||||
|
"pandas>=2.3.3",
|
||||||
|
"pyfaidx>=0.9.0.3",
|
||||||
|
"pyliftover>=0.4.1",
|
||||||
|
"rich>=14.2.0",
|
||||||
|
]
|
||||||
233
design/src/editseq.py
Normal file
233
design/src/editseq.py
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
created by lanzl
|
||||||
|
modified by zym
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pyfaidx import Fasta, FetchError
|
||||||
|
|
||||||
|
HG19_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/gencode/GRCh37.p13.genome.fa.gz"
|
||||||
|
HG38_FASTA_PATH = "/rawdata1/project/peRNA_design/ref/ensembl_115/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_region(region_str: str) -> tuple:
    """Parse a genomic region of the form ``chrom:start-end``.

    Ensures the chromosome name carries a ``chr`` prefix and returns
    ``(chrom, start, end)`` with integer coordinates.

    Raises
    ------
    ValueError
        If *region_str* does not match ``chrom:start-end`` (the original
        code fell through to an opaque ``AttributeError`` on ``None``).
    """
    match = re.match(r"(\w+):(\d+)-(\d+)", region_str)
    if match is None:
        raise ValueError(f"invalid region string: {region_str!r}")

    chrom, start, end = match.groups()

    if not chrom.lower().startswith("chr"):
        chrom = "chr" + chrom

    return chrom, int(start), int(end)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_orf_sequence(genome: Fasta, chrom: str, start: int, end: int) -> str:
    """Extract a sequence from a preloaded ``Fasta`` (1-based inclusive).

    Falls back to the alternate chromosome naming convention
    ("chr1" <-> "1") when the requested name is missing from the FASTA
    index.  The original only tried stripping the "chr" prefix; adding
    it is now attempted as well, so FASTAs indexed with "chr" names also
    work with bare inputs.

    Raises
    ------
    FetchError
        When neither naming convention resolves.
    """
    try:
        return str(genome.get_seq(chrom, start, end)).upper()
    except (KeyError, FetchError) as e:
        # Try the other naming convention.
        if chrom.lower().startswith("chr"):
            alt_chrom = chrom[3:]
        else:
            alt_chrom = "chr" + chrom

        try:
            return str(genome.get_seq(alt_chrom, start, end)).upper()
        except (KeyError, FetchError) as inner_e:
            raise FetchError(
                f"Requested rname '{chrom}' (also tried '{alt_chrom}') does not exist in FASTA index."
            ) from inner_e
|
||||||
|
|
||||||
|
|
||||||
|
def generate_editseq(
    original: str,
    replacement: str,
    region_str: str,
    genome: Fasta,
    flank_size: int = 100,
) -> str:
    """Assemble an EditSeq string: the mutation plus flanking context.

    The edit is rendered as ``(ORIG/REPL)`` for substitutions/delins,
    ``(-ORIG)`` for deletions and ``(+REPL)`` for insertions, embedded
    between ``flank_size`` bases of upstream and downstream sequence.
    """
    chrom, mut_start, mut_end = parse_region(region_str)

    # Flanking context on either side of the mutation site.
    left = extract_orf_sequence(genome, chrom, mut_start - flank_size, mut_start - 1)
    right = extract_orf_sequence(genome, chrom, mut_end + 1, mut_end + flank_size)

    original = str(original).strip()
    replacement = str(replacement).strip()

    # All edit flavours share the (ORIGINAL/REPLACEMENT)-style notation.
    if original and replacement:
        core = f"({original}/{replacement})"      # substitution / delins
    elif original:
        core = f"(-{original})"                   # deletion
    elif replacement:
        core = f"(+{replacement})"                # insertion
    else:
        core = "(Invalid mutation logic)"

    return f"{left}{core}{right}"
|
||||||
|
|
||||||
|
|
||||||
|
# --- 氨基酸突变处理 ---
|
||||||
|
def process_aa_mutations(df_aa: pd.DataFrame, genome_hg38: Fasta) -> pd.DataFrame:
    """Turn amino-acid saturation-mutagenesis rows into EditSeq records.

    Returns a DataFrame with ``sequence_name``, ``editseq``,
    ``strategy`` and ``mutation_type`` columns.  Note: ``origial_code``
    (sic) is the actual column name in the input table.
    """
    records = []

    # Iterate over plain dicts -- much faster than DataFrame.iterrows().
    for entry in df_aa.to_dict("records"):
        orig = "" if pd.isna(entry["origial_code"]) else str(entry["origial_code"]).strip()
        repl = "" if pd.isna(entry["mutation_code"]) else str(entry["mutation_code"]).strip()

        # Classify the edit from which sides are present.
        if orig and repl:
            mut_type = "REPL"
        elif orig:
            mut_type = "DEL"
        elif repl:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"

        # Sequence name uses the raw (unstripped) codes, as before.
        seq_name = f"{entry['gene']}_AA{entry['aa_index']}_{entry['origial_code']}_{entry['mutation_code']}"

        # generate_editseq receives the raw row values (it strips them itself).
        editseq = generate_editseq(
            original=entry["origial_code"],
            replacement=entry["mutation_code"],
            region_str=entry["aa_region"],
            genome=genome_hg38,
        )

        records.append(
            {
                "sequence_name": seq_name,
                "editseq": editseq,
                "strategy": entry["strategy"],
                "mutation_type": mut_type,
            }
        )

    return pd.DataFrame(records)
|
||||||
|
|
||||||
|
|
||||||
|
# --- SNP/cDNA 突变处理 ---
|
||||||
|
def process_snp_mutations(
    df_snp: pd.DataFrame, genome_hg19: Fasta, genome_hg38: Fasta
) -> pd.DataFrame:
    """Turn SNP/cDNA mutation rows into EditSeq records.

    Returns a DataFrame with ``sequence_name``, ``editseq`` and
    ``mutation_type`` columns.  The genome build is chosen per row from
    the ``version`` column ("hg38" selects *genome_hg38*, anything else
    falls back to *genome_hg19*).
    """
    records = []

    # Iterate over plain dicts -- much faster than DataFrame.iterrows().
    for entry in df_snp.to_dict("records"):
        orig = (
            "" if pd.isna(entry["original_codon"]) else str(entry["original_codon"]).strip()
        )
        repl = (
            "" if pd.isna(entry["mutation_code"]) else str(entry["mutation_code"]).strip()
        )

        # Pick the reference genome matching the row's coordinate system.
        genome_to_use = (
            genome_hg38 if str(entry["version"]).lower() == "hg38" else genome_hg19
        )

        # Classify the edit for the sequence name.
        if orig and repl:
            mut_type = "REPL"
        elif orig:
            mut_type = "DEL"
        elif repl:
            mut_type = "INS"
        else:
            mut_type = "UNKNOWN"

        # Sanitise the cDNA code for use in an identifier.
        cdna_code_clean = str(entry["cdna code"]).replace(".", "").replace("_", "p")
        seq_name = f"{entry['gene']}_{mut_type}_{cdna_code_clean}"

        editseq = generate_editseq(
            original=orig,
            replacement=repl,
            region_str=str(entry["mutation_region"]),
            genome=genome_to_use,
        )

        records.append(
            {"sequence_name": seq_name, "editseq": editseq, "mutation_type": mut_type}
        )

    return pd.DataFrame(records)
|
||||||
|
|
||||||
|
|
||||||
|
def run_analysis(infile, reference, outdir):
    """Generate EditSeq CSVs from an amino-acid target table.

    Reads *infile* (CSV with a ``strategy`` column), groups rows by
    strategy and writes one ``aa_<STRATEGY>_editseq_output.csv`` per
    group into *outdir*, using *reference* as the genome FASTA.
    """
    # AA_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_aa_target.csv"
    # SNP_INPUT_FILE = "/rawdata1/project/peRNA_design/gene_snp_target.csv"

    # genome_hg19 = Fasta(HG19_FASTA_PATH)
    # genome_hg38 = Fasta(HG38_FASTA_PATH)
    genome = Fasta(reference)
    os.makedirs(outdir, exist_ok=True)

    aa_df_input = pd.read_csv(infile)
    # snp_df_input = pd.read_csv(SNP_INPUT_FILE)

    # --- Stage 1: SNP/cDNA mutation processing (currently disabled) ---
    # snp_df = process_snp_mutations(snp_df_input, genome_hg19, genome_hg38)
    # snp_output_file = "snp_editseq_output.csv"
    # snp_df.to_csv(snp_output_file, index=False)

    # --- Stage 2: process AA mutations grouped by strategy ---
    aa_df_input["strategy"] = aa_df_input["strategy"].str.upper()
    strategies = aa_df_input["strategy"].unique()

    for strategy in strategies:
        # Skip rows with a missing strategy.
        if pd.isna(strategy):
            continue

        # Filter the DataFrame down to this strategy.
        aa_subset_df = aa_df_input[aa_df_input["strategy"] == strategy].copy()
        if aa_subset_df.empty:
            continue

        # Process the subset.
        aa_df_processed = process_aa_mutations(aa_subset_df, genome)

        # Save one CSV per strategy.
        aa_output_file = f"aa_{strategy}_editseq_output.csv"
        aa_df_processed.to_csv(os.path.join(outdir, aa_output_file), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # BUG FIX: run_analysis() was called with no arguments, but it
    # requires (infile, reference, outdir) -- the script always died with
    # a TypeError.  Take the three paths from the command line instead.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} <infile> <reference> <outdir>")
    run_analysis(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||||
53
design/src/liftover.py
Normal file
53
design/src/liftover.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from pyliftover import LiftOver
|
||||||
|
|
||||||
|
# lo = LiftOver('/home/zym/projects/pgRNA/liftover/hg19ToHg38.over.chain.gz')
|
||||||
|
|
||||||
|
from pyliftover import LiftOver
|
||||||
|
|
||||||
|
# Build an hg19 -> hg38 coordinate converter.
# NOTE(review): constructed at import time; pyliftover may download the
# chain file over the network here -- consider lazy initialization.
lo = LiftOver("hg19", "hg38")
|
||||||
|
|
||||||
|
|
||||||
|
def convert_interval(chrom, start, end):
    """Lift the interval (start, end) from the source to the target genome.

    Returns ``(new_chrom, new_start, new_end)`` or ``None`` when either
    endpoint cannot be converted.

    NOTE(review): pyliftover's ``convert_coordinate`` uses 0-based
    coordinates (like BED) per its documentation; the earlier comment
    claiming 1-based input appears wrong -- confirm against callers.
    """
    hits_start = lo.convert_coordinate(chrom, start)
    hits_end = lo.convert_coordinate(chrom, end)

    if not hits_start or not hits_end:
        return None  # no mapping for at least one endpoint

    # Keep the mapping with the highest chain score.
    top_start = max(hits_start, key=lambda hit: hit[3])
    top_end = max(hits_end, key=lambda hit: hit[3])

    new_chrom = top_start[0]
    new_start, new_end = top_start[1], top_end[1]

    # Lifted coordinates can come back inverted (e.g. strand flips);
    # normalise so that start <= end.
    if new_start >= new_end:
        new_start, new_end = new_end, new_start
    return new_chrom, new_start, new_end
|
||||||
|
|
||||||
|
|
||||||
|
def get_seq(path, coord):
    """Fetch the sequence for ``coord = (chrom, start, end)`` from a FASTA.

    Coordinates are 0-based half-open, matching BED/pysam conventions.
    The FASTA must have a matching ``.fai`` index next to it.
    """
    import pysam

    reference = pysam.FastaFile(path)  # reads genome.fa.fai automatically
    chrom, start, end = coord[0], coord[1], coord[2]
    return reference.fetch(region=f"{chrom}:{start}-{end}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Import-only module: no CLI behaviour.
    pass
|
||||||
216
design/src/mutation.py
Normal file
216
design/src/mutation.py
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
from Bio.Seq import Seq
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
# Standard genetic code: DNA codon -> one-letter amino acid ('*' = stop).
codon_table = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

# Reverse lookup: amino acid -> list of codons encoding it.
aa_to_codons = {}
for codon, aa in codon_table.items():
    if aa not in aa_to_codons:
        aa_to_codons[aa] = []
    aa_to_codons[aa].append(codon)

# DNA bases used when enumerating codon variants.
bases = ["A", "T", "G", "C"]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_nnn():
    """Return all 64 NNN codon combinations."""
    triplets = itertools.product(bases, repeat=3)
    return ["".join(t) for t in triplets]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_2n_variants(original_codon, fixed_position=None):
    """Enumerate double-N codon variants (two positions random, one kept).

    ``fixed_position`` (0/1/2) selects which index stays unchanged; when
    omitted, all three patterns (NNx, NxN, xNN) are produced.  Returns a
    sorted, de-duplicated list of codons.
    """
    anchors = [0, 1, 2] if fixed_position is None else [fixed_position]
    variants = set()

    for anchor in anchors:
        keep = original_codon[anchor]
        free = [i for i in range(3) if i != anchor]
        for b1 in bases:
            for b2 in bases:
                letters = ["_", "_", "_"]
                letters[anchor] = keep
                letters[free[0]] = b1
                letters[free[1]] = b2
                variants.add("".join(letters))

    return sorted(variants)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_1n_variants(original_codon, fixed_positions=None):
    """Enumerate single-N codon variants (one position random, two kept).

    ``fixed_positions`` is a pair of indices (e.g. ``[0, 1]``) that stay
    unchanged; when omitted, all three patterns are produced.  Returns a
    sorted, de-duplicated list of codons.

    Cleanup: the original computed ``var_pos = 3 - sum(fix)`` and then
    immediately recomputed the same value with a scan loop; the dead
    recomputation is removed.
    """
    variants = set()
    if fixed_positions:
        position_pairs = [fixed_positions]
    else:
        position_pairs = [[0, 1], [0, 2], [1, 2]]

    for fix in position_pairs:
        # The remaining index is the variable one (indices sum to 0+1+2 = 3).
        var_pos = 3 - sum(fix)
        base1, base2 = original_codon[fix[0]], original_codon[fix[1]]
        for b in bases:
            codon_list = ["_", "_", "_"]
            codon_list[fix[0]] = base1
            codon_list[fix[1]] = base2
            codon_list[var_pos] = b
            variants.add("".join(codon_list))

    return sorted(variants)
|
||||||
|
|
||||||
|
|
||||||
|
def translate(codon):
    """Translate a DNA codon into its amino acid; 'X' for unknown codons."""
    try:
        return codon_table[codon]
    except KeyError:
        return "X"
|
||||||
|
|
||||||
|
|
||||||
|
def design_mutations_for_orf(dna_seq, strategy="3N"):
    """Design saturation mutations across an entire ORF.

    Parameters
    ----------
    dna_seq : str
        Coding sequence; length must be a multiple of 3.
    strategy : str
        '3N' (all 64 codons), '2N' (two random positions) or '1N'
        (one random position).

    Returns
    -------
    list[dict]
        One entry per codon with the original codon/amino acid, the
        variant codons and summary statistics.

    Raises
    ------
    ValueError
        If the length is not divisible by 3 or the strategy is unknown.
    """
    if len(dna_seq) % 3 != 0:
        raise ValueError(f"ORF 长度必须是 3 的倍数!{dna_seq}")

    num_codons = len(dna_seq) // 3
    results = []

    for i in range(num_codons):
        start = i * 3
        end = start + 3
        orig_codon = dna_seq[start:end]
        orig_aa = translate(orig_codon)

        logger.debug(
            f"\n--- 位点 {i + 1} (氨基酸 {i + 1}): {orig_aa} ({orig_codon}) ---"
        )

        # Enumerate variant codons according to the chosen strategy.
        variants = []
        if strategy == "3N":
            variants = generate_nnn()
            logger.debug(f"策略: 3N (NNN) → 共 {len(variants)} 种组合")
        elif strategy == "2N":
            variants = generate_2n_variants(orig_codon)
            logger.debug(f"策略: 2N (任意两个随机) → 共 {len(variants)} 种组合")
        elif strategy == "1N":
            variants = generate_1n_variants(orig_codon)
            logger.debug(f"策略: 1N (任意一个随机) → 共 {len(variants)} 种组合")
        else:
            raise ValueError("strategy 必须是 '3N', '2N', 或 '1N'")

        # Filter out malformed codons (should not occur in practice).
        valid_variants = [v for v in variants if len(v) == 3]

        # Summarise the amino-acid distribution of the variants.
        mutant_aa_count = {}
        stop_count = 0
        for v in valid_variants:
            aa = translate(v)
            if aa == "*":
                stop_count += 1
            mutant_aa_count[aa] = mutant_aa_count.get(aa, 0) + 1

        logger.debug(f"→ 共产生 {len(valid_variants)} 个有效突变")
        logger.debug(f"→ 可产生 {len(mutant_aa_count)} 种不同氨基酸(含终止)")
        logger.debug(f"→ 引入终止密码子: {stop_count} 次")
        logger.debug(f"→ 氨基酸分布: {mutant_aa_count}")

        results.append(
            {
                "position": i + 1,
                "original_codon": orig_codon,
                "original_aa": orig_aa,
                "variants": valid_variants,
                "variant_count": len(valid_variants),
                "mutant_aa_count": mutant_aa_count,
                "stop_count": stop_count,
            }
        )

    return results
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Import-only module: no CLI behaviour.
    pass
|
||||||
236
design/src/reader.py
Normal file
236
design/src/reader.py
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
从gtf中读取CDS,并读取对应的sequence
|
||||||
|
"""
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from Bio.Seq import Seq
|
||||||
|
from loguru import logger
|
||||||
|
from pyfaidx import Fasta
|
||||||
|
|
||||||
|
|
||||||
|
class Region(object):
    """Genomic interval used to merge overlapping CDS records."""

    def __init__(self, chrom, start, end, strand="+", kind=None):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.strand = strand
        self.sequence = None   # filled in later by set_seq()
        self.kind_ = kind      # raw kind label; see the `kind` property
        self.addition = None

    @classmethod
    def create(cls, region):
        # Build a Region from a "chrom:start-end" string (strand '+').
        chrom, sites = region.split(":")
        sites = [int(x) for x in sites.split("-")]
        return cls(chrom, sites[0], sites[-1], "+")

    def set_seq(self, sequence: str):
        self.sequence = sequence.upper()

        # Reverse-complement when the region lies on the minus strand.
        # NOTE(review): the complement is taken from the original
        # (non-uppercased) input -- confirm casing is irrelevant upstream.
        if self.strand == "-":
            self.sequence = str(Seq(sequence).reverse_complement())

    def __and__(self, other):
        # Overlap test; regions on different chromosomes never overlap.
        if self.chrom != other.chrom:
            return False

        return self.start < other.end and self.end > other.start

    def __add__(self, other):
        # Merge with an overlapping region (mutates and returns self).
        if not self & other:
            raise ValueError("没有重合位点")

        self.start = min(self.start, other.start)
        self.end = max(self.end, other.end)
        return self

    def __str__(self) -> str:
        return f"{self.chrom}:{self.start}-{self.end}"

    def __hash__(self):
        # Hash by coordinates.  NOTE(review): __eq__ is not defined, so
        # equality (and `in` checks) remain identity-based.
        return hash(str(self))

    def __len__(self):
        return self.end - self.start

    @property
    def kind(self):
        # Regions shorter than a codon are flagged as crossing a boundary.
        if len(self) >= 3:
            return self.kind_
        if not self.kind_:
            return ""
        else:
            return f"cross_{self.kind_}"
|
||||||
|
|
||||||
|
|
||||||
|
def read_gtf(gtf_path):
    """Read a GTF file and return a DataFrame of its CDS features.

    The raw ``attribute`` column is parsed and expanded into one column
    per attribute key, concatenated onto the CDS rows.
    """
    logger.info("正在读取 GTF 文件...")
    gtf_columns = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attribute",
    ]

    raw = pd.read_csv(
        gtf_path, sep="\t", comment="#", header=None, names=gtf_columns, low_memory=False
    )

    # Keep only CDS rows.
    cds = raw[raw["feature"] == "CDS"].copy()

    # Expand the attribute strings into a column-per-key DataFrame.
    try:
        parsed = pd.json_normalize(cds["attribute"].apply(parse_attributes))
    except Exception as e:
        logger.error(f"解析 attribute 字段失败: {e}")
        raise

    # Align indices and stitch the parsed attributes onto the CDS rows.
    merged = pd.concat([cds.reset_index(drop=True), parsed], axis=1)

    logger.info(f"成功读取并解析 GTF 文件,共 {len(merged)} 个 CDS 特征。")
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def parse_attributes(attr_str):
    """Parse a GTF attribute string into a dict.

    Each ``key "value"`` pair (separated by ';') becomes one entry;
    surrounding double quotes are stripped from values, and bare tokens
    without a space are ignored.
    """
    result = {}
    for chunk in attr_str.split(";"):
        chunk = chunk.strip()
        if not chunk or " " not in chunk:
            continue
        key, _, value = chunk.partition(" ")
        result[key] = value.strip('"')
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_cds_for_gene(cds_df, gene_name):
    """Collect CDS rows for *gene_name* and merge overlapping intervals.

    NOTE(review): despite the name and original docstring ("pick the
    longest transcript"), rows are matched on
    ``transcript_id == gene_name`` -- callers must pass a transcript ID.
    See the commented-out gene_name match below.
    """
    logger.info(f"正在查找基因 '{gene_name}' 的 CDS...")

    # Attach parsed attribute dicts to every row.
    cds_df["attributes_parsed"] = cds_df["attribute"].apply(parse_attributes)

    # Select rows whose transcript matches the requested name.
    gene_cds_list = []
    for idx, row in cds_df.iterrows():
        attrs = row["attributes_parsed"]
        if attrs.get("transcript_id") == gene_name:
            # if attrs.get('gene_name') == gene_name or attrs.get('gene_id').startswith(gene_name):
            gene_cds_list.append(row)

    if not gene_cds_list:
        raise ValueError(f"未在 GTF 中找到基因 '{gene_name}'")

    df = pd.DataFrame(gene_cds_list)
    df = df[
        ["seqname", "feature", "start", "end", "strand", "transcript_id"]
    ].drop_duplicates()

    # Merge consecutive overlapping CDS intervals into Region objects.
    # NOTE(review): assumes rows appear coordinate-sorted in the GTF.
    res = []
    last = None
    for _, row in df.iterrows():
        temp = Region(
            str(row["seqname"]),
            row["start"],
            row["end"],
            str(row["strand"]),
            row["transcript_id"],
        )
        if last is None:
            last = temp
        elif temp & last:
            last = last + temp
        else:
            res.append(last)
            last = temp
    # Flush the trailing region (identity-based `in` check).
    if last not in res:
        res.append(last)

    return res
|
||||||
|
|
||||||
|
|
||||||
|
def load_uniprot_region(path):
    """Read 'chrom:start-end' regions from a file, merging overlaps.

    Lines starting with '#' are skipped; the first whitespace-separated
    field of every remaining line is parsed as a region.  Consecutive
    overlapping regions are merged into one.
    """
    merged = []
    current = None
    with open(path) as handle:
        for raw_line in handle:
            if raw_line.startswith("#"):
                continue
            candidate = Region.create(raw_line.split()[0])
            if current is None:
                current = candidate
            elif candidate & current:
                current = current + candidate
            else:
                merged.append(current)
                current = candidate

    # Flush the trailing region (identity-based `in` check).
    if current not in merged:
        merged.append(current)
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def extract_orf_sequence(genome_fasta, cds_rows, half_open=False):
    """Fill each Region in *cds_rows* with its genomic sequence.

    Parameters
    ----------
    genome_fasta : str
        Path to the reference FASTA (pyfaidx-indexable).
    cds_rows : list[Region]
        CDS regions to annotate; mutated in place via ``set_seq``.
    half_open : bool
        When True, treat region ends as half-open (end exclusive).
    """

    if not cds_rows:
        raise ValueError("not cds")

    seqname = cds_rows[0].chrom
    strand = cds_rows[0].strand  # NOTE(review): read but never used -- confirm intent

    logger.debug(f"从参考基因组提取序列 (chr{seqname})...")
    genome = Fasta(genome_fasta)

    # Fetch the chromosome record, retrying with the alternate
    # "chr"-prefix naming convention if the first lookup fails.
    try:
        chrom_seq = genome[seqname]  # e.g. "chr1" vs "1", depending on the FASTA
    except KeyError:
        if "chr" in seqname:
            seqname = seqname.replace("chr", "")
        else:
            seqname = "chr" + seqname
        chrom_seq = genome[seqname]  # e.g. "chr1" vs "1", depending on the FASTA

    for row in cds_rows:
        start = int(row.start) - 1  # GTF is 1-based; pyfaidx slicing is 0-based
        end = int(row.end) - (1 if half_open else 0)

        # Sub-codon regions that cross a CDS boundary are widened to a
        # full codon anchored at the appropriate edge.
        if len(row) < 3 and "cross" in row.kind:
            if row.kind == "cross_start":
                start = end - 3
            else:
                end = start + 3

        row.set_seq(chrom_seq[start:end].seq)

    return cds_rows
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Import-only module: no CLI behaviour.
    pass
|
||||||
139
design/src/safe_target.py
Normal file
139
design/src/safe_target.py
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
生成safe targeting 序列
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import random
|
||||||
|
import pysam
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# Global RNG seed for reproducible sampling.
seed = 42
# Use the named constant so the seed cannot drift out of sync with a
# duplicated literal (previously: random.seed(42)).
random.seed(seed)
|
||||||
|
|
||||||
|
|
||||||
|
# RNA codon table keyed by amino-acid name.  The keys are runtime data
# (looked up and compared elsewhere), so they stay in Chinese; the
# English amino-acid names are given in trailing comments.
__AAs__ = {
    "丙氨酸": ["GCU", "GCC", "GCA", "GCG"],  # alanine
    "精氨酸": ["CGU", "CGC", "CGA", "CGG", "AGA", "AGG"],  # arginine
    "天冬酰胺": ["AAU", "AAC"],  # asparagine
    "天冬氨酸": ["GAU", "GAC"],  # aspartate
    "半胱氨酸": ["UGU", "UGC"],  # cysteine
    "谷氨酰胺": ["CAA", "CAG"],  # glutamine
    "谷氨酸": ["GAA", "GAG"],  # glutamate
    "甘氨酸": ["GGU", "GGC", "GGA", "GGG"],  # glycine
    "组氨酸": ["CAU", "CAC"],  # histidine
    "异亮氨酸": ["AUU", "AUC", "AUA"],  # isoleucine
    "亮氨酸": ["UUA", "UUG", "CUU", "CUC", "CUA", "CUG"],  # leucine
    "赖氨酸": ["AAA", "AAG"],  # lysine
    "甲硫氨酸": ["AUG"],  # methionine
    "苯丙氨酸": ["UUU", "UUC"],  # phenylalanine
    "脯氨酸": ["CCU", "CCC", "CCA", "CCG"],  # proline
    "丝氨酸": ["UCU", "UCC", "UCA", "UCG", "AGU", "AGC"],  # serine
    "苏氨酸": ["ACU", "ACC", "ACA", "ACG"],  # threonine
    "色氨酸": ["UGG"],  # tryptophan
    "酪氨酸": ["UAU", "UAC"],  # tyrosine
    "缬氨酸": ["GUU", "GUC", "GUA", "GUG"],  # valine
    "终止密码子": ["UAA", "UAG", "UGA"],  # stop codons
}
|
||||||
|
|
||||||
|
|
||||||
|
def codons():
    """Yield (amino-acid name, DNA codon list) pairs from ``__AAs__``.

    The stored RNA codons are converted to DNA (U -> T) on the fly.
    """
    for name, rna_codons in __AAs__.items():
        yield name, [rna.replace("U", "T") for rna in rna_codons]
|
||||||
|
|
||||||
|
|
||||||
|
class Region:
    """Represents a 3bp codon region in the full sequence."""

    def __init__(self, chrom: str, start: int, end: int):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.__shift__ = 0  # reading-frame offset discovered by shift()

    def __str__(self):
        return f"{self.chrom}:{self.start}-{self.end}"

    def shift(self, fasta: str):
        """Find the first offset whose 3bp window is a valid codon.

        BUG FIX: the original loop's stop condition (``__shift__ != 0``)
        could not fire when a codon was found at offset 0, so a later
        offset silently overwrote the correct frame.  Returning on the
        first hit fixes that (and preserves behaviour for offsets > 0).
        """
        for offset in range(0, self.end - self.start):
            window = extract_fastq_seq(
                fasta, Region(self.chrom, self.start + offset, self.start + offset + 3)
            )
            for _, dna_codons in codons():
                if window in dna_codons:
                    self.__shift__ = offset
                    return

    def choose(self, number: int = 3):
        """Pick up to *number* random in-frame codon regions."""
        length_of_codon = 3

        # Enumerate every complete codon window starting at the frame shift.
        windows = []
        for pos in range(self.start + self.__shift__, self.end, length_of_codon):
            if pos + length_of_codon > self.end:
                break
            windows.append([pos, pos + length_of_codon])

        # Fewer windows than requested: return them all.
        if number > len(windows):
            return [Region(self.chrom, w[0], w[1]) for w in windows]

        return [Region(self.chrom, w[0], w[1]) for w in random.sample(windows, number)]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_fastq_seq(fastq: str, region: Region, seq_len: int = 100):
    """Fetch the bases covered by *region* from an indexed FASTA file.

    ``seq_len`` is accepted for interface compatibility but is not used
    by this function.
    """
    with pysam.FastaFile(fastq) as handle:
        return handle.fetch(region.chrom, region.start, region.end)
|
||||||
|
|
||||||
|
|
||||||
|
def mutation(seq: str):
    """Return a DNA codon encoding a different amino acid than *seq*.

    *seq* is a DNA codon (A/T/G/C).  Re-seeds the RNG with the module
    seed so the substitution is deterministic per call.  Returns ``None``
    when *seq* is not a recognised codon.

    BUG FIX: ``__AAs__`` stores RNA codons (with U), so any input codon
    containing T could never match and the function silently returned
    ``None`` for it; the comparison now uses the DNA form from
    ``codons()``.  Inputs that matched before still match the same amino
    acid, so existing results are unchanged.
    """
    random.seed(seed)
    for aa_name, dna_codons in codons():
        if seq in dna_codons:
            other_aa = random.sample([name for name in __AAs__ if name != aa_name], 1)[0]
            return random.sample(__AAs__[other_aa], 1)[0].replace("U", "T")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main(infile, outfile, reference = "../ref/UCSC/hg19.fa.gz", seq_len: int = 100):
    """Generate safe-targeting edit sequences.

    Samples 2000 regions from the 'Human Safe Regions' sheet of *infile*,
    picks up to 5 in-frame codons per region, mutates each to a codon of
    a different amino acid and writes ``sequence_name,editseq`` rows to
    *outfile*.  ``seq_len`` is the flank length on each side of the edit.
    """

    meta = pd.read_excel(infile, sheet_name="Human Safe Regions", header=None)
    # Deterministic (seeded) subsample of candidate safe regions.
    # NOTE(review): fails if the sheet has fewer than 2000 rows.
    meta = meta.sample(n=2000, random_state=seed)

    data = []
    for idx in tqdm(meta.iloc[:, 0], total=meta.shape[0]):

        # The first column holds "chrom;start;end"-style identifiers.
        idx = idx.split(";")
        region = Region(idx[0], int(idx[1]), int(idx[2]))
        region.shift(reference)

        regions = region.choose(5)

        for reg in regions:
            seq = extract_fastq_seq(reference, reg)
            mut = mutation(seq)

            # mutation() returns None for codons it does not recognise.
            if seq is None or mut is None:
                continue

            key = str(reg) + "_" + seq + "_" + mut
            before = extract_fastq_seq(reference, Region(region.chrom, reg.start - seq_len, reg.start))
            after = extract_fastq_seq(reference, Region(region.chrom, reg.end, reg.end + seq_len))

            # EditSeq format: upstream(original/mutant)downstream.
            seq = f"{before}({seq}/{mut}){after}"
            data.append({"sequence_name": key, "editseq": seq})

    data = pd.DataFrame(data)
    data.to_csv(outfile, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    from fire import Fire

    # Expose main() on the command line (infile, outfile, reference, seq_len).
    Fire(main)
|
||||||
|
|
||||||
114
design/src/snp.py
Normal file
114
design/src/snp.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""用来解析snp错配信息"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from itertools import product
|
||||||
|
|
||||||
|
|
||||||
|
def generate_sequences_with_combinations(seq):
    """Expand every ``N`` in a DNA sequence into all A/T/C/G possibilities.

    Each ``N`` (IUPAC "any base") is replaced independently, so a sequence
    containing k ``N``s yields 4**k concrete sequences.  Beware of the
    exponential blow-up for large k.

    Args:
        seq (str): input DNA sequence, possibly containing ``N``.

    Returns:
        list: every concrete sequence, in deterministic product order
        (same order as the previous run-based expansion produced).
    """
    if "N" not in seq:
        return [seq]

    # One option tuple per position: a fixed base stays fixed, an N fans
    # out to the four nucleotides.  itertools.product (already imported at
    # module level; the old local re-import was redundant) then enumerates
    # all combinations.  This replaces the previous three-pass
    # segment/expand/rebuild logic, whose "if not n_block_options" branch
    # was unreachable anyway once an N was known to be present.
    per_position = [("A", "T", "C", "G") if base == "N" else (base,) for base in seq]
    return ["".join(combo) for combo in product(*per_position)]
|
||||||
|
|
||||||
|
|
||||||
|
def decode_snp(label, ref_start=0):
    """Parse an HGVS-like SNP label into sorted positions and an edit rule.

    Returns ``""`` for a missing label; otherwise a tuple of
    ``(sorted positions shifted by ref_start, rule string)`` where the rule
    is the label with all digits/offsets stripped.  Genomic (``g.``) labels
    are required when *ref_start* <= 0, cDNA (``c.``) labels otherwise.

    Raises:
        ValueError: when the label prefix does not match *ref_start*.
    """
    if label is None:
        return ""

    # Drop any "chr:" / transcript prefix before the coordinate part.
    if ":" in label:
        label = label.split(":")[-1]

    if ref_start <= 0 and not label.startswith("g."):
        raise ValueError(f"{label} not genomic label")
    elif ref_start > 0 and not label.startswith("c."):
        raise ValueError(f"{label} not cdna label")

    # Strip the coordinate-system prefix and any "[n]" repeat counts.
    label = re.sub(r"([cg]\.|\[\d+\])", "", label)

    sites = []
    for token in label.split("_"):
        if not token:
            continue

        token = re.sub(r"[^\d\+-]", "", token)
        if "+" in token:
            parts = [int(piece) for piece in token.split("+")]
            position = parts[0] + parts[-1]
        elif "-" in token:
            # NOTE(review): the "-" (intronic upstream) offset is ADDED,
            # exactly like the "+" branch — confirm this is intended.
            parts = [int(piece) for piece in token.split("-")]
            position = parts[0] + parts[-1]
        else:
            position = int(token)

        sites.append(position + ref_start)

    rule = re.sub(r"[\d_\+-]", "", label)
    return sorted(sites), rule.strip()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Library module: no standalone CLI behaviour.
    pass
|
||||||
|
|
||||||
263
design/src/snv-N-2N-3N.py
Normal file
263
design/src/snv-N-2N-3N.py
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import itertools
|
||||||
|
import random
|
||||||
|
import gzip
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
# ps
|
||||||
|
# 将目标序列按3bp密码子分割,每个密码子系统性地生成三类突变
|
||||||
|
# 为每个突变自动附加100bp的上下游侧翼序列,每类随机抽取150个突变
|
||||||
|
|
||||||
|
# Mutation design constants
NUCLEOTIDES = ["A", "T", "C", "G"]  # DNA alphabet used for substitutions
UPSTREAM_LEN = 100  # bp of flank prepended to each edited codon
DOWNSTREAM_LEN = 100  # bp of flank appended to each edited codon
TARGET_MUTATIONS = 150  # mutations sampled per strategy (1N/2N/3N)
|
||||||
|
|
||||||
|
|
||||||
|
class Region:
    """A 3 bp codon window within the full target sequence.

    Attributes:
        chrom: contig/gene label the window belongs to.
        start: 0-based start offset of the codon.
        end: offset of the codon's last base (callers pass start + 2,
            i.e. inclusive).
        sequence: the codon itself, normalised to upper case.
        absolute_index: start offset within the full sequence.
    """

    def __init__(
        self, chrom: str, start: int, end: int, sequence: str, absolute_index: int
    ):
        self.chrom = chrom
        self.start = start
        self.end = end
        self.sequence = sequence.upper()  # normalise case once, at construction
        self.absolute_index = absolute_index

    def __repr__(self) -> str:
        # Debugging/logging aid; nothing in this module relies on it.
        return (
            f"{type(self).__name__}({self.chrom!r}, {self.start}, {self.end}, "
            f"{self.sequence!r}, {self.absolute_index})"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def read_fasta(fasta_path: str) -> Dict[str, str]:
    """Parse an (optionally gzipped) FASTA file into ``{header: sequence}``.

    Headers are truncated at the first whitespace; sequences are
    upper-cased with RNA "U" converted to "T".  Lines appearing before the
    first header are ignored.  A missing file logs an error and returns an
    empty dict.
    """
    sequences: Dict[str, str] = {}
    current_header: Optional[str] = None
    opener = gzip.open if fasta_path.endswith(".gz") else open

    if not os.path.exists(fasta_path):
        logger.error(f"FASTA file not found: {fasta_path}")
        return {}

    chunks: List[str] = []

    def _flush() -> None:
        # Store the accumulated record, normalised to DNA upper case.
        if current_header and chunks:
            sequences[current_header] = "".join(chunks).upper().replace("U", "T")

    with opener(fasta_path, "rt") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(">"):
                _flush()
                current_header = line[1:].split()[0]
                chunks = []
            elif current_header:
                chunks.append(line)

    _flush()  # last record has no following ">" to trigger the flush
    return sequences
|
||||||
|
|
||||||
|
|
||||||
|
def split_sequence_to_codons(full_seq: str, gene_name: str) -> List[Region]:
    """Chunk *full_seq* into complete 3 bp codons as Region objects.

    A trailing partial codon (1-2 bp) is discarded.  Each Region's ``end``
    is the inclusive offset of the codon's last base (start + 2).
    """
    # Stop before any incomplete trailing codon instead of filtering it out.
    last_complete = len(full_seq) - len(full_seq) % 3
    return [
        Region(gene_name, pos, pos + 2, full_seq[pos : pos + 3], pos)
        for pos in range(0, last_complete, 3)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_variant(ref: str, alt: str) -> str:
|
||||||
|
"""Simplifies substitution representation (e.g., 'CAT'->'CGT' becomes 'C(A/G)T')."""
|
||||||
|
if len(ref) != len(alt):
|
||||||
|
return f"({ref}/{alt})"
|
||||||
|
|
||||||
|
diffs = [
|
||||||
|
{"index": i, "ref_base": ref[i], "alt_base": alt[i]}
|
||||||
|
for i in range(len(ref))
|
||||||
|
if ref[i] != alt[i]
|
||||||
|
]
|
||||||
|
|
||||||
|
if not diffs:
|
||||||
|
return ref
|
||||||
|
|
||||||
|
positions = [d["index"] for d in diffs]
|
||||||
|
is_consecutive = all(
|
||||||
|
positions[i + 1] - positions[i] == 1 for i in range(len(positions) - 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Rule 1: Continuous differences (e.g., 'CAT'->'CGC' becomes 'C(AT/GC)')
|
||||||
|
if is_consecutive:
|
||||||
|
start_pos, end_pos = positions[0], positions[-1] + 1
|
||||||
|
return f"{ref[:start_pos]}({ref[start_pos:end_pos]}/{alt[start_pos:end_pos]}){ref[end_pos:]}"
|
||||||
|
|
||||||
|
# Rule 2: Intermittent differences (e.g., 'GTT'->'GCG' becomes 'G(T/C)(T/G)')
|
||||||
|
out = []
|
||||||
|
prev_end = 0
|
||||||
|
for d in diffs:
|
||||||
|
pos, r, a = d["index"], d["ref_base"], d["alt_base"]
|
||||||
|
out.append(ref[prev_end:pos])
|
||||||
|
out.append(f"({r}/{a})")
|
||||||
|
prev_end = pos + 1
|
||||||
|
out.append(ref[prev_end:])
|
||||||
|
|
||||||
|
return "".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_codon_mutations(
    original_codon: str, n_mutations: int, alphabet=("A", "T", "C", "G")
) -> List[str]:
    """Generate every codon variant carrying exactly *n_mutations* changes.

    Args:
        original_codon: the reference codon (any length, normally 3 bp).
        n_mutations: exact number of positions that must differ.
        alphabet: candidate bases per mutated position; defaults to the DNA
            alphabet (generalises the previously hard-coded NUCLEOTIDES
            module constant — existing callers are unaffected).

    Returns:
        Sorted list of unique mutant codons, each differing from the
        original at exactly *n_mutations* positions.
    """
    mutants = set()
    codon_length = len(original_codon)

    # Choose which positions mutate, then take the cartesian product of
    # the allowed bases at every position.
    for mutated_positions in itertools.combinations(range(codon_length), n_mutations):
        per_position = [
            [base for base in alphabet if base != original_codon[i]]
            if i in mutated_positions
            else [original_codon[i]]
            for i in range(codon_length)
        ]
        for bases in itertools.product(*per_position):
            candidate = "".join(bases)
            # Defensive: mutated positions exclude the original base, so
            # the candidate can never equal the original.
            if candidate != original_codon:
                mutants.add(candidate)

    return sorted(mutants)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_editseq_and_metadata(
    full_seq: str, regions: List[Region], gene_name: str
) -> pd.DataFrame:
    """Build the 1N/2N/3N mutation table for every codon region.

    For each codon: take up/downstream flanks from *full_seq*, enumerate
    all variants with 3, 2 and 1 substitutions, and emit one row per
    variant containing a descriptive name and the flanked '(ref/alt)'
    edit sequence.
    """
    rows: List[Dict[str, str]] = []

    for codon_no, region in enumerate(regions):
        codon_start = region.absolute_index
        ref_codon = region.sequence

        # 1. Flanking context, clipped at the sequence boundaries.
        upstream = full_seq[max(0, codon_start - UPSTREAM_LEN) : codon_start]
        downstream = full_seq[
            codon_start + 3 : min(len(full_seq), codon_start + 3 + DOWNSTREAM_LEN)
        ]

        for strategy, n_mut in (("3N", 3), ("2N", 2), ("1N", 1)):
            for alt_codon in generate_codon_mutations(ref_codon, n_mut):
                # Compact '(ref/alt)' representation of the substitution.
                compact = analyze_variant(ref_codon, alt_codon)

                rows.append(
                    {
                        # GENE_SUB_STRATEGY_AAINDEX_ORIGINAL>MUTATION
                        "sequence_name": f"{gene_name}_SUB_{strategy}_AA{codon_no + 1}_{ref_codon}>{alt_codon}",
                        "editseq": f"{upstream}{compact}{downstream}",
                        "strategy": strategy,
                        "mutation_type": "REPL",  # Replacement
                    }
                )

    return pd.DataFrame(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def run_mutation_design(fasta_file: str, gene_name: str, output_base_name: str):
    """Executes the mutation design pipeline and saves 3 separate files.

    Steps: load the FASTA, pick the record whose header contains
    *gene_name* (falling back to the longest sequence), enumerate every
    1N/2N/3N codon mutation, then sample up to TARGET_MUTATIONS rows per
    strategy and write one CSV per strategy.  *output_base_name* must
    contain the literal "{strategy}" placeholder.
    """

    logger.info(f"Targeting gene: {gene_name}")
    fasta_data = read_fasta(fasta_file)
    full_seq, target_id = "", ""

    # Locate target sequence by case-insensitive substring match on headers.
    for seq_id, seq in fasta_data.items():
        if gene_name.upper() in seq_id.upper():
            full_seq = seq
            target_id = seq_id
            break

    if not full_seq and fasta_data:
        # Fallback: use longest sequence
        target_id, full_seq = max(fasta_data.items(), key=lambda item: len(item[1]))
        if full_seq:
            logger.warning(
                f"Using longest sequence ID: {target_id} (Length: {len(full_seq)} bp)"
            )

    if not full_seq:
        logger.error(f"Failed to extract target sequence.")
        return

    logger.info(f"Target sequence ID: {target_id}, Length: {len(full_seq)} bp")

    # 1. Generate ALL mutations (1N, 2N, 3N)
    cds_regions = split_sequence_to_codons(full_seq, gene_name)
    all_mutations_df = generate_editseq_and_metadata(full_seq, cds_regions, gene_name)

    # 2. Process and save, one output file per strategy
    strategies = ["1N", "2N", "3N"]

    for strategy in strategies:
        # Filter for the current strategy
        strategy_df = all_mutations_df[all_mutations_df["strategy"] == strategy].copy()
        original_count = len(strategy_df)

        # Determine output file name (e.g., AAVS1_1N_150_mutations.csv)
        output_file_name = output_base_name.replace("{strategy}", strategy)

        if original_count == 0:
            logger.warning(
                f"Strategy {strategy}: No mutations generated. Skipping file creation for {output_file_name}."
            )
            continue

        # Random sampling for the current strategy; fixed random_state
        # keeps the sampled library reproducible across runs.
        if original_count > TARGET_MUTATIONS:
            final_df = strategy_df.sample(n=TARGET_MUTATIONS, random_state=42)
            logger.success(
                f"Strategy {strategy}: Sampled {TARGET_MUTATIONS} mutations from {original_count} designs."
            )
        else:
            final_df = strategy_df
            logger.warning(
                f"Strategy {strategy}: Generated {original_count} mutations; saving all."
            )

        # Save result, ensuring column order
        final_df[["sequence_name", "editseq", "strategy", "mutation_type"]].to_csv(
            output_file_name, index=False
        )
        logger.success(f"Strategy {strategy}: Design saved to {output_file_name}.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Hard-coded run configuration for the AAVS1 saturation design.
    AAVS1_FASTA_PATH = (
        "/rawdata1/project/peRNA_design/ref/AAVS1/ncbi_dataset/data/rna.fna"
    )
    GENE_NAME = "AAVS1"
    # "{strategy}" is substituted with 1N/2N/3N inside run_mutation_design.
    OUTPUT_BASE_NAME = "AAVS1_{strategy}_150_mutations.csv"

    run_mutation_design(
        fasta_file=AAVS1_FASTA_PATH,
        gene_name=GENE_NAME,
        output_base_name=OUTPUT_BASE_NAME,
    )
|
||||||
87
filter.py
Normal file
87
filter.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import gzip
|
||||||
|
from glob import glob
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def amino_acid_to_codon(amino_acid):
    """Map a one-letter amino-acid code to its DNA codons.

    Uses the standard genetic code; "*" denotes the stop codons.

    Args:
        amino_acid (str): single-letter amino-acid code, any case.

    Returns:
        list: DNA codons encoding that amino acid, or [] when unknown.
    """
    codon_table = {
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'N': ['AAT', 'AAC'],
        'D': ['GAT', 'GAC'],
        'C': ['TGT', 'TGC'],
        'E': ['GAA', 'GAG'],
        'Q': ['CAA', 'CAG'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        'H': ['CAT', 'CAC'],
        'I': ['ATT', 'ATC', 'ATA'],
        'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'K': ['AAA', 'AAG'],
        'M': ['ATG'],
        'F': ['TTT', 'TTC'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'W': ['TGG'],
        'Y': ['TAT', 'TAC'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
        '*': ['TAA', 'TAG', 'TGA'],
    }

    # Normalise to upper case; unknown letters yield an empty list.
    return codon_table.get(amino_acid.upper(), [])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main(ref, infile, outfile):
    """Filter a gzipped design CSV down to known codon/target pairs.

    Builds the allowed-key set from sheet index 1 of *ref* (each row's
    amino acid, expanded to all synonymous codons, combined with its
    target column), then streams *infile* and writes only the rows whose
    "<codon>_<target>" key — parsed from the sequence/target name column —
    is in that set.  The header row is always passed through.

    Args:
        ref: Excel workbook; sheet 1 rows hold (..., amino_acid, target).
            # NOTE(review): column meaning inferred from row[1]/row[2] usage
        infile: gzipped CSV with a "sequence_name" or "Target_name" column.
        outfile: gzipped CSV destination (parent dirs created on demand).
    """
    print(infile, outfile)
    df = pd.read_excel(ref, 1)

    # Allowed "<codon>_<target>" keys, expanded over synonymous codons.
    keys = {}
    for _, row in df.iterrows():
        row = list(row)
        for src in amino_acid_to_codon(row[1]):
            keys[f"{src}_{row[2]}"] = 0

    if os.path.dirname(outfile):
        os.makedirs(os.path.dirname(outfile), exist_ok = True)

    # Column index of the name field; resolved once from the header line
    # instead of re-computing it for every data row as before.
    target = None
    with gzip.open(outfile, "wt+") as w, gzip.open(infile, "rt") as r:
        for line in r:
            cells = line.strip().split(",")

            if target is None:
                # Header: pass it through and locate the name column once.
                w.write(line.strip() + "\n")
                try:
                    target = cells.index("sequence_name")
                except ValueError:
                    target = cells.index("Target_name")
                continue

            # Key = everything after the first two "_"-separated fields.
            key = "_".join(cells[target].split("_")[2:]).strip('"')
            if key in keys:
                w.write(line.strip() + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # CLI entry point via python-fire.
    from fire import Fire
    Fire(main)
|
||||||
|
|
||||||
210
filter_freq.py
Normal file
210
filter_freq.py
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import gzip
|
||||||
|
import heapq
|
||||||
|
import click
|
||||||
|
import csv
|
||||||
|
import polars as pl
|
||||||
|
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
from glob import glob
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def amino_acid_to_codon():
    """Return the set of all 64 DNA codons of the standard genetic code.

    The name is historical: this module reuses the per-amino-acid codon
    table from filter.py but flattens it into a single membership set
    (61 coding codons plus the 3 stop codons).  The old docstring
    documented a parameter this function does not take.

    Returns:
        set: every 3-letter DNA codon.
    """
    genetic_code = {
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'N': ['AAT', 'AAC'],
        'D': ['GAT', 'GAC'],
        'C': ['TGT', 'TGC'],
        'E': ['GAA', 'GAG'],
        'Q': ['CAA', 'CAG'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        'H': ['CAT', 'CAC'],
        'I': ['ATT', 'ATC', 'ATA'],
        'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'K': ['AAA', 'AAG'],
        'M': ['ATG'],
        'F': ['TTT', 'TTC'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'W': ['TGG'],
        'Y': ['TAT', 'TAC'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
        '*': ['TAA', 'TAG', 'TGA'],
    }

    # Flatten the per-amino-acid lists directly into a set (the previous
    # list-concatenation pass was an unnecessary intermediate).
    codes = set()
    for codons in genetic_code.values():
        codes.update(codons)
    return codes
|
||||||
|
|
||||||
|
__CODONS__ = amino_acid_to_codon()
|
||||||
|
|
||||||
|
|
||||||
|
def reader(path: str, rt_len: int = 24):
    """Stream rows of a gzipped CSV, keeping only short-RT designs.

    Yields each row (as a dict) whose "RTlength" value parses to a float
    <= *rt_len*.  Memory use is constant (one row buffered), suitable for
    GB-scale files.  Rows with a missing or unparseable "RTlength" are
    skipped rather than aborting the whole stream — previously only a
    ``None`` value (TypeError) was tolerated, so any non-numeric cell or
    absent column crashed the reader.
    """
    with gzip.open(path, "rt", newline="") as f:
        for row in csv.DictReader(f):
            try:
                rt_length = float(row["RTlength"])
            except (TypeError, ValueError, KeyError):
                # None, non-numeric text, or absent column: skip the row.
                continue
            if rt_length <= rt_len:
                yield row
|
||||||
|
|
||||||
|
|
||||||
|
def __check_target__(key: str):
    """Return True when the final "_"-field of *key* is a known codon.

    A ">" separator (GENE_..._SRC>DST naming) is treated like "_" so the
    DST codon becomes the final field.
    """
    normalized = key.replace(">", "_") if ">" in key else key
    return normalized.split("_")[-1] in __CODONS__
|
||||||
|
|
||||||
|
|
||||||
|
def __decode_codon_n__(key: str) -> str:
    """Degenerate-ise a mutation name: mismatched bases become "N".

    e.g. "BRIP1_AA580_CTC_CTT" -> "BRIP1_AA580_CTC_CTN": the last field
    is rewritten base-by-base against the second-to-last (reference)
    field, keeping matching bases and masking differences with "N".  A
    ">" separator is first normalised to "_".

    Raises:
        IndexError: (after printing the fields) when the name has fewer
            than two "_"-separated fields.
    """
    if ">" in key:
        key = key.replace(">", "_")

    fields = key.split("_")

    try:
        reference, variant = fields[-2], fields[-1]
        fields[-1] = "".join(
            ref_base if ref_base == alt_base else "N"
            for ref_base, alt_base in zip(reference, variant)
        )
    except IndexError as err:
        print(fields)
        raise err
    return "_".join(fields)
|
||||||
|
|
||||||
|
def __call_func__(args):
    """Worker: keep the top-N scoring designs per target in one file.

    Args (packed in a single tuple for Pool.imap):
        f: input gzipped CSV path.
        outdir: output directory (the input basename is reused).
        top_n: number of best-scoring records kept per sequence name.
        degenerate: if True, collapse mutation names with "N" masking so
            synonymous designs compete in one bucket.

    Streams the file once, maintaining a bounded min-heap per key, then
    writes the winners (score-descending within each key) as gzipped CSV.
    """
    f, outdir, top_n, degenerate = args

    # key -> min-heap of (score, tie_breaker, record).  The monotonically
    # increasing tie_breaker guarantees heapq never compares the record
    # dicts themselves: previously (score, record) tuples raised TypeError
    # whenever two scores tied (dicts are unorderable), which the old code
    # only logged and re-raised.
    data = {}
    pushed = 0

    # Stream the (pre-filtered) rows.
    for rec in tqdm(reader(f)):

        # Keys follow the designed sequence-name convention.
        key = rec["sequence_name"]

        if not __check_target__(key):
            # Skip targets that are not a known coding codon.
            continue

        if degenerate:
            try:
                key = __decode_codon_n__(rec["sequence_name"])
                rec["orig_seq_name"] = rec.pop("sequence_name")
                rec["sequence_name"] = key
            except IndexError:
                continue

        if key not in data:
            data[key] = []

        # Pick whichever score column this file provides.
        if "DeepCas9score" in rec.keys():
            k = "DeepCas9score"
        elif "PRIDICT2_0_editing_Score_deep_K562" in rec.keys():
            k = "PRIDICT2_0_editing_Score_deep_K562"
        else:
            print(f, rec)
            continue

        try:
            score = float(rec[k])
        except (ValueError, KeyError):
            print(f"Warning: Skipping invalid record in {f}: {rec}")
            continue

        pushed += 1
        entry = (score, pushed, rec)
        if len(data[key]) < top_n:
            heapq.heappush(data[key], entry)
        elif score > data[key][0][0]:
            heapq.heapreplace(data[key], entry)

    # Second pass: flatten, best scores first within each key.
    final_records = []
    for heap in data.values():
        ranked = sorted(heap, key=lambda item: item[0], reverse=True)
        final_records.extend(rec for _, _, rec in ranked)

    if not final_records:
        print(f"No valid records in {f}, skipping output.")
        return

    # Write safely via the csv module, mirroring the input basename.
    output_path = os.path.join(outdir, os.path.basename(f))
    with gzip.open(output_path, "wt+", newline="", encoding="utf-8") as w:
        writer = csv.DictWriter(w, fieldnames=final_records[0].keys(), quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        writer.writerows(final_records)
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.option("-i", "--indir", type=str, help="字符串形式的输入路径,可以*通配多个文件和目录")
@click.option("-o", "--outdir", type=str, help="输出目录")
@click.option("-t", "--top-n", type=int, help="选择前几", default=3)
@click.option("-n", "--degenerate", is_flag=True, help="是否使用兼并碱基")
@click.argument('args', nargs=-1)  # capture remaining positional arguments
def main(indir, outdir, top_n, degenerate, args):
    """Fan the per-file top-N filter out over a glob of input files.

    Positional arguments may stand in for -i/-o: the first one is the
    input glob, the last one the output directory.
    """

    # Allow positional fallbacks for the two path options.
    if not indir and len(args) > 0:
        indir = args[0]
    if not outdir and len(args) > 0:
        outdir = args[-1]

    if indir == outdir:
        raise ValueError("indir and outdir should not be the same")

    os.makedirs(outdir, exist_ok=True)

    # Expand the input glob into one worker-argument tuple per file.
    args = [[f, outdir, top_n, degenerate] for f in glob(indir)]

    # One process per matched file; tqdm tracks completion.
    # NOTE(review): an empty glob makes Pool(0) raise — confirm inputs
    # are validated upstream.
    with Pool(len(args)) as p:
        list(tqdm(p.imap(__call_func__, args), total=len(args)))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # click handles all argument parsing.
    main()
|
||||||
|
|
||||||
0
interactive/README.md
Normal file
0
interactive/README.md
Normal file
222
interactive/db.py
Normal file
222
interactive/db.py
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import peewee as pw
|
||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import gzip
|
||||||
|
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
|
||||||
|
# SQLite database file holding all pegRNA design tables.
db = pw.SqliteDatabase("./pegrna.db")


class BaseModel(pw.Model):
    """Peewee base model binding every table to the shared SQLite db."""

    class Meta:
        database = db
|
||||||
|
|
||||||
|
|
||||||
|
# Source-column -> canonical-field mapping for each design tool's CSV.
# Keys are the raw CSV headers; values are the model field names used by
# the Pridict2 / PrimeDesign tables below.
KEY_MAP = {
    "pridict2": {
        "sequence_name": "sequence",
        "EditedAllele": "dst",
        "OriginalAllele": "src",
        "PRIDICT2_0_editing_Score_deep_K562": "k562",
        "PRIDICT2_0_editing_Score_deep_HEK": "hek",
        "K562_rank": "k562_rank",
        "HEK_rank": "hek_rank",
        "PRIDICT2_Format": "template",
        "Target-Strand": "strand",
        "PBSlength": "pbs_len",
        "RToverhanglength": "rtt_oh_len",
        "RTlength": "rtt_len",
        "Spacer-Sequence": "spacer",
        "Scaffold_Optimized": "scaffold",
        "pegRNA": "pegrna",
        "PBSrevcomp": "pbs",
        "RTseqoverhangrevcomp": "rtt_oh",
        "RTrevcomp": "rtt",
    },
    "prime_design": {
        "Target_name": "sequence",
        # src/dst have no source columns here; format_data() derives them
        # from the target name.
        # "": "dst",
        # "": "src",
        "Target_sequence": "template",
        "Strand": "strand",
        "PBS_length": "pbs_len",
        "RTT_length": "rtt_len",
        "Spacer_sequence": "spacer",
        "PAM_sequence": "pam",
        "Extension_sequence": "extension",  # RTT + PBS
        "Spacer_sequence_order_TOP": "before_spacer",
        "Spacer_sequence_order_BOTTOM": "after_spacer",
        "pegRNA_extension_sequence_order_TOP": "before_pegnra_ext",
        "pegRNA_extension_sequence_order_BOTTOM": "after_pegnra_ext",
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
def bulk_insert(table, data, chunk = 100):
    """Insert *data* rows into *table* in chunked batches, one transaction.

    Chunking keeps each INSERT statement's bound-variable count small.
    """
    with db.atomic():
        for offset in range(0, len(data), chunk):
            batch = data[offset:offset + chunk]
            table.insert_many(batch).execute()
|
||||||
|
|
||||||
|
|
||||||
|
class Pridict2(BaseModel):
    """One PRIDICT2 design row (columns mapped via KEY_MAP["pridict2"])."""

    # Parsed from the sequence name: gene symbol and amino-acid index.
    gene = pw.CharField()
    aa = pw.IntegerField()

    # Full designed sequence name.
    sequence = pw.CharField()

    # Original and edited alleles.
    src = pw.CharField()
    dst = pw.CharField()

    # Predicted editing scores per cell line.
    k562 = pw.FloatField()
    hek = pw.FloatField()

    # Rank of this design within each cell line's predictions.
    k562_rank = pw.IntegerField()
    hek_rank = pw.IntegerField()

    # PRIDICT2 input template and target strand.
    template = pw.CharField()
    strand = pw.CharField()

    # Component lengths: primer-binding site, RT overhang, full RT.
    pbs_len = pw.IntegerField()
    rtt_oh_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    # Oligo components (reverse-complemented in the source CSV).
    spacer = pw.CharField()
    scaffold = pw.CharField()
    pegrna = pw.CharField()
    pbs = pw.CharField()
    rtt_oh = pw.CharField()
    rtt = pw.CharField()

    class Meta:
        table_name = "pridict2"
|
||||||
|
|
||||||
|
|
||||||
|
class PrimeDesign(BaseModel):
    """One PrimeDesign row (columns mapped via KEY_MAP["prime_design"])."""

    # Parsed from the target name: gene symbol and amino-acid index.
    gene = pw.CharField()
    aa = pw.IntegerField()
    sequence = pw.CharField()
    # src/dst have no source columns; format_data() recovers them from
    # the trailing fields of the target name.
    src = pw.CharField()
    dst = pw.CharField()

    template = pw.CharField()
    strand = pw.CharField()

    pbs_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    pam = pw.CharField()
    spacer = pw.CharField()
    extension = pw.CharField()  # RTT + PBS
    # pbs/rtt are sliced out of `extension` by format_data() when lengths
    # are consistent.
    pbs = pw.CharField()
    rtt = pw.CharField()
    before_spacer = pw.CharField()
    after_spacer = pw.CharField()

    before_pegnra_ext = pw.CharField()
    after_pegnra_ext = pw.CharField()

    class Meta:
        table_name = "prime_design"
|
||||||
|
|
||||||
|
|
||||||
|
def format_data(value: Dict, mapping: Dict[str, str]) -> Dict:
    """Normalise one raw CSV row into canonical model fields.

    Renames columns via *mapping*, then derives fields the source file
    lacks: src/dst alleles, the amino-acid index and the gene name are
    parsed from the "GENE_AA<idx>_SRC_DST"-style sequence name, and
    pbs/rtt are sliced out of the extension when the lengths line up.

    Args:
        value: raw row (column name -> cell text).
        mapping: source-column -> canonical-field mapping (see KEY_MAP).

    Returns:
        dict of canonical field names ready for model insertion.
    """
    res = {}
    # Distinct loop names: the original shadowed the `value` parameter
    # with its own loop variable, which only worked by accident.
    for column, cell in value.items():
        if column in mapping.keys():
            res[mapping[column]] = cell

    # Hoist the repeated sequence-name split.
    name_parts = res["sequence"].split("_")

    if not res.get("src"):
        # PrimeDesign rows carry no explicit alleles: recover them from
        # the trailing "<src>_<dst>" fields of the name.
        res["src"] = name_parts[-2]
        res["dst"] = name_parts[-1]

    res["aa"] = int(re.sub(r"\D", "", name_parts[1]))  # "AA123" -> 123
    res["gene"] = name_parts[0]

    if not res.get("pbs") and res.get("extension") and res.get("pbs_len") and res.get("rtt_len"):
        # extension = PBS + RTT; split only when the lengths are coherent.
        if len(res["extension"]) == int(res["pbs_len"]) + int(res["rtt_len"]):
            res["pbs"] = res["extension"][:int(res["pbs_len"])]
            res["rtt"] = res["extension"][int(res["pbs_len"]):]
    return res
|
||||||
|
|
||||||
|
|
||||||
|
def insert(path: str, kind: str = "PRIDICT2", chunk: int = 10000):
    """Load a gzipped design CSV into SQLite using chunked bulk inserts.

    Args:
        path: gzipped CSV produced by PRIDICT2 or PrimeDesign.
        kind: which KEY_MAP entry / table to target ("pridict2" or
            "prime_design", case-insensitive).
        chunk: number of rows buffered before each bulk insert.
    """

    # Create tables lazily on first use.
    if not Pridict2.table_exists():
        Pridict2.create_table()

    if not PrimeDesign.table_exists():
        PrimeDesign.create_table()

    kind = kind.lower()

    assert kind in KEY_MAP.keys()

    data = []
    rows = 0
    with gzip.open(path, 'rt', encoding='utf-8') as file:
        csv_dict_reader = csv.DictReader(file)

        # Read row by row; each row is a dict keyed by column name.
        for row in csv_dict_reader:
            # Normalise the raw row before buffering it for insertion.
            data.append(format_data(row, KEY_MAP[kind]))
            rows += 1
            if len(data) >= chunk:
                print(f"finished {rows} rows")
                bulk_insert(Pridict2 if kind == "pridict2" else PrimeDesign, data)
                data = []

    # Flush the trailing partial chunk.
    if data:
        bulk_insert(Pridict2 if kind == "pridict2" else PrimeDesign, data)
|
||||||
|
|
||||||
|
|
||||||
|
def index():
    """Create (IF NOT EXISTS) single-column indexes on the query columns.

    One simple index per frequently-filtered column of each model; the
    generated SQL is identical to the previous two copy-pasted loops,
    now deduplicated into a helper.
    """

    def _build(model, table_name, columns):
        # Emit one CREATE INDEX per column, idempotently.
        for col in columns:
            print(model.__name__, col.name)
            sql = f"CREATE INDEX IF NOT EXISTS {model.__name__}_{col.name}_idx ON {table_name} ({col.name});"
            db.execute_sql(sql)

    _build(
        Pridict2,
        "pridict2",
        [
            Pridict2.gene,
            Pridict2.aa,
            Pridict2.sequence,
            Pridict2.dst,
            Pridict2.src,
            Pridict2.k562,
            Pridict2.hek,
            Pridict2.pbs_len,
            Pridict2.rtt_len,
        ],
    )

    _build(
        PrimeDesign,
        "prime_design",
        [
            PrimeDesign.gene,
            PrimeDesign.aa,
            PrimeDesign.sequence,
            PrimeDesign.dst,
            PrimeDesign.src,
            PrimeDesign.pbs_len,
            PrimeDesign.rtt_len,
        ],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def table_columns(table):
    """Return *table*'s class-level attributes, minus dunder machinery.

    Any attribute whose name contains "__" is excluded.
    """
    public = {}
    for name, attr in table.__dict__.items():
        if "__" in name:
            continue
        public[name] = attr
    return public
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Ad-hoc smoke check: show the mapped columns of the Pridict2 model.
    print(table_columns(Pridict2))
    pass
|
||||||
|
|
||||||
5
interactive/frontend/README.md
Normal file
5
interactive/frontend/README.md
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Vue 3 + TypeScript + Vite
|
||||||
|
|
||||||
|
This template should help get you started developing with Vue 3 and TypeScript in Vite. The template uses Vue 3 `<script setup>` SFCs, check out the [script setup docs](https://v3.vuejs.org/api/sfc-script-setup.html#sfc-script-setup) to learn more.
|
||||||
|
|
||||||
|
Learn more about the recommended Project Setup and IDE Support in the [Vue Docs TypeScript Guide](https://vuejs.org/guide/typescript/overview.html#project-setup).
|
||||||
77
interactive/frontend/auto-imports.d.ts
vendored
Normal file
77
interactive/frontend/auto-imports.d.ts
vendored
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
/* eslint-disable */
|
||||||
|
/* prettier-ignore */
|
||||||
|
// @ts-nocheck
|
||||||
|
// noinspection JSUnusedGlobalSymbols
|
||||||
|
// Generated by unplugin-auto-import
|
||||||
|
// biome-ignore lint: disable
|
||||||
|
export {}
|
||||||
|
declare global {
|
||||||
|
const EffectScope: typeof import('vue').EffectScope
|
||||||
|
const computed: typeof import('vue').computed
|
||||||
|
const createApp: typeof import('vue').createApp
|
||||||
|
const customRef: typeof import('vue').customRef
|
||||||
|
const defineAsyncComponent: typeof import('vue').defineAsyncComponent
|
||||||
|
const defineComponent: typeof import('vue').defineComponent
|
||||||
|
const effectScope: typeof import('vue').effectScope
|
||||||
|
const getCurrentInstance: typeof import('vue').getCurrentInstance
|
||||||
|
const getCurrentScope: typeof import('vue').getCurrentScope
|
||||||
|
const getCurrentWatcher: typeof import('vue').getCurrentWatcher
|
||||||
|
const h: typeof import('vue').h
|
||||||
|
const inject: typeof import('vue').inject
|
||||||
|
const isProxy: typeof import('vue').isProxy
|
||||||
|
const isReactive: typeof import('vue').isReactive
|
||||||
|
const isReadonly: typeof import('vue').isReadonly
|
||||||
|
const isRef: typeof import('vue').isRef
|
||||||
|
const isShallow: typeof import('vue').isShallow
|
||||||
|
const markRaw: typeof import('vue').markRaw
|
||||||
|
const nextTick: typeof import('vue').nextTick
|
||||||
|
const onActivated: typeof import('vue').onActivated
|
||||||
|
const onBeforeMount: typeof import('vue').onBeforeMount
|
||||||
|
const onBeforeUnmount: typeof import('vue').onBeforeUnmount
|
||||||
|
const onBeforeUpdate: typeof import('vue').onBeforeUpdate
|
||||||
|
const onDeactivated: typeof import('vue').onDeactivated
|
||||||
|
const onErrorCaptured: typeof import('vue').onErrorCaptured
|
||||||
|
const onMounted: typeof import('vue').onMounted
|
||||||
|
const onRenderTracked: typeof import('vue').onRenderTracked
|
||||||
|
const onRenderTriggered: typeof import('vue').onRenderTriggered
|
||||||
|
const onScopeDispose: typeof import('vue').onScopeDispose
|
||||||
|
const onServerPrefetch: typeof import('vue').onServerPrefetch
|
||||||
|
const onUnmounted: typeof import('vue').onUnmounted
|
||||||
|
const onUpdated: typeof import('vue').onUpdated
|
||||||
|
const onWatcherCleanup: typeof import('vue').onWatcherCleanup
|
||||||
|
const provide: typeof import('vue').provide
|
||||||
|
const reactive: typeof import('vue').reactive
|
||||||
|
const readonly: typeof import('vue').readonly
|
||||||
|
const ref: typeof import('vue').ref
|
||||||
|
const resolveComponent: typeof import('vue').resolveComponent
|
||||||
|
const shallowReactive: typeof import('vue').shallowReactive
|
||||||
|
const shallowReadonly: typeof import('vue').shallowReadonly
|
||||||
|
const shallowRef: typeof import('vue').shallowRef
|
||||||
|
const toRaw: typeof import('vue').toRaw
|
||||||
|
const toRef: typeof import('vue').toRef
|
||||||
|
const toRefs: typeof import('vue').toRefs
|
||||||
|
const toValue: typeof import('vue').toValue
|
||||||
|
const triggerRef: typeof import('vue').triggerRef
|
||||||
|
const unref: typeof import('vue').unref
|
||||||
|
const useAttrs: typeof import('vue').useAttrs
|
||||||
|
const useCssModule: typeof import('vue').useCssModule
|
||||||
|
const useCssVars: typeof import('vue').useCssVars
|
||||||
|
const useDialog: typeof import('naive-ui').useDialog
|
||||||
|
const useId: typeof import('vue').useId
|
||||||
|
const useLoadingBar: typeof import('naive-ui').useLoadingBar
|
||||||
|
const useMessage: typeof import('naive-ui').useMessage
|
||||||
|
const useModel: typeof import('vue').useModel
|
||||||
|
const useNotification: typeof import('naive-ui').useNotification
|
||||||
|
const useSlots: typeof import('vue').useSlots
|
||||||
|
const useTemplateRef: typeof import('vue').useTemplateRef
|
||||||
|
const watch: typeof import('vue').watch
|
||||||
|
const watchEffect: typeof import('vue').watchEffect
|
||||||
|
const watchPostEffect: typeof import('vue').watchPostEffect
|
||||||
|
const watchSyncEffect: typeof import('vue').watchSyncEffect
|
||||||
|
}
|
||||||
|
// for type re-export
|
||||||
|
declare global {
|
||||||
|
// @ts-ignore
|
||||||
|
export type { Component, Slot, Slots, ComponentPublicInstance, ComputedRef, DirectiveBinding, ExtractDefaultPropTypes, ExtractPropTypes, ExtractPublicPropTypes, InjectionKey, PropType, Ref, ShallowRef, MaybeRef, MaybeRefOrGetter, VNode, WritableComputedRef } from 'vue'
|
||||||
|
import('vue')
|
||||||
|
}
|
||||||
30
interactive/frontend/components.d.ts
vendored
Normal file
30
interactive/frontend/components.d.ts
vendored
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
/* eslint-disable */
|
||||||
|
// @ts-nocheck
|
||||||
|
// biome-ignore lint: disable
|
||||||
|
// oxlint-disable
|
||||||
|
// ------
|
||||||
|
// Generated by unplugin-vue-components
|
||||||
|
// Read more: https://github.com/vuejs/core/pull/3399
|
||||||
|
|
||||||
|
export {}
|
||||||
|
|
||||||
|
/* prettier-ignore */
|
||||||
|
declare module 'vue' {
|
||||||
|
export interface GlobalComponents {
|
||||||
|
HelloWorld: typeof import('./src/components/HelloWorld.vue')['default']
|
||||||
|
NConfigProvider: typeof import('naive-ui')['NConfigProvider']
|
||||||
|
NDataTable: typeof import('naive-ui')['NDataTable']
|
||||||
|
NFlex: typeof import('naive-ui')['NFlex']
|
||||||
|
NForm: typeof import('naive-ui')['NForm']
|
||||||
|
NFormItem: typeof import('naive-ui')['NFormItem']
|
||||||
|
NGi: typeof import('naive-ui')['NGi']
|
||||||
|
NGrid: typeof import('naive-ui')['NGrid']
|
||||||
|
NInputNumber: typeof import('naive-ui')['NInputNumber']
|
||||||
|
NLayout: typeof import('naive-ui')['NLayout']
|
||||||
|
NLayoutContent: typeof import('naive-ui')['NLayoutContent']
|
||||||
|
NLayoutHeader: typeof import('naive-ui')['NLayoutHeader']
|
||||||
|
NMessageProvider: typeof import('naive-ui')['NMessageProvider']
|
||||||
|
NPagination: typeof import('naive-ui')['NPagination']
|
||||||
|
NSelect: typeof import('naive-ui')['NSelect']
|
||||||
|
}
|
||||||
|
}
|
||||||
13
interactive/frontend/index.html
Normal file
13
interactive/frontend/index.html
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>frontend</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="app"></div>
|
||||||
|
<script type="module" src="/src/main.ts"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
26
interactive/frontend/package.json
Normal file
26
interactive/frontend/package.json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"name": "frontend",
|
||||||
|
"private": true,
|
||||||
|
"version": "0.0.0",
|
||||||
|
"type": "module",
|
||||||
|
"scripts": {
|
||||||
|
"dev": "vite",
|
||||||
|
"build": "vue-tsc -b && vite build",
|
||||||
|
"preview": "vite preview"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"axios": "^1.13.5",
|
||||||
|
"naive-ui": "^2.43.2",
|
||||||
|
"unplugin-auto-import": "^21.0.0",
|
||||||
|
"unplugin-vue-components": "^31.0.0",
|
||||||
|
"vue": "^3.5.25"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^24.10.1",
|
||||||
|
"@vitejs/plugin-vue": "^6.0.2",
|
||||||
|
"@vue/tsconfig": "^0.8.1",
|
||||||
|
"typescript": "~5.9.3",
|
||||||
|
"vite": "^7.3.1",
|
||||||
|
"vue-tsc": "^3.1.5"
|
||||||
|
}
|
||||||
|
}
|
||||||
1554
interactive/frontend/pnpm-lock.yaml
generated
Normal file
1554
interactive/frontend/pnpm-lock.yaml
generated
Normal file
File diff suppressed because it is too large
Load Diff
1
interactive/frontend/public/vite.svg
Normal file
1
interactive/frontend/public/vite.svg
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="31.88" height="32" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 257"><defs><linearGradient id="IconifyId1813088fe1fbc01fb466" x1="-.828%" x2="57.636%" y1="7.652%" y2="78.411%"><stop offset="0%" stop-color="#41D1FF"></stop><stop offset="100%" stop-color="#BD34FE"></stop></linearGradient><linearGradient id="IconifyId1813088fe1fbc01fb467" x1="43.376%" x2="50.316%" y1="2.242%" y2="89.03%"><stop offset="0%" stop-color="#FFEA83"></stop><stop offset="8.333%" stop-color="#FFDD35"></stop><stop offset="100%" stop-color="#FFA800"></stop></linearGradient></defs><path fill="url(#IconifyId1813088fe1fbc01fb466)" d="M255.153 37.938L134.897 252.976c-2.483 4.44-8.862 4.466-11.382.048L.875 37.958c-2.746-4.814 1.371-10.646 6.827-9.67l120.385 21.517a6.537 6.537 0 0 0 2.322-.004l117.867-21.483c5.438-.991 9.574 4.796 6.877 9.62Z"></path><path fill="url(#IconifyId1813088fe1fbc01fb467)" d="M185.432.063L96.44 17.501a3.268 3.268 0 0 0-2.634 3.014l-5.474 92.456a3.268 3.268 0 0 0 3.997 3.378l24.777-5.718c2.318-.535 4.413 1.507 3.936 3.838l-7.361 36.047c-.495 2.426 1.782 4.5 4.151 3.78l15.304-4.649c2.372-.72 4.652 1.36 4.15 3.788l-11.698 56.621c-.732 3.542 3.979 5.473 5.943 2.437l1.313-2.028l72.516-144.72c1.215-2.423-.88-5.186-3.54-4.672l-25.505 4.922c-2.396.462-4.435-1.77-3.759-4.114l16.646-57.705c.677-2.35-1.37-4.583-3.769-4.113Z"></path></svg>
|
||||||
|
After Width: | Height: | Size: 1.5 KiB |
26
interactive/frontend/src/App.vue
Normal file
26
interactive/frontend/src/App.vue
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
<script setup lang="ts">
|
||||||
|
import HelloWorld from './components/HelloWorld.vue'
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<template>
|
||||||
|
<n-config-provider>
|
||||||
|
<n-message-provider>
|
||||||
|
<HelloWorld />
|
||||||
|
</n-message-provider>
|
||||||
|
</n-config-provider>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<style scoped>
|
||||||
|
.logo {
|
||||||
|
height: 6em;
|
||||||
|
padding: 1.5em;
|
||||||
|
will-change: filter;
|
||||||
|
transition: filter 300ms;
|
||||||
|
}
|
||||||
|
.logo:hover {
|
||||||
|
filter: drop-shadow(0 0 2em #646cffaa);
|
||||||
|
}
|
||||||
|
.logo.vue:hover {
|
||||||
|
filter: drop-shadow(0 0 2em #42b883aa);
|
||||||
|
}
|
||||||
|
</style>
|
||||||
1
interactive/frontend/src/assets/vue.svg
Normal file
1
interactive/frontend/src/assets/vue.svg
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="37.07" height="36" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 198"><path fill="#41B883" d="M204.8 0H256L128 220.8L0 0h97.92L128 51.2L157.44 0h47.36Z"></path><path fill="#41B883" d="m0 0l128 220.8L256 0h-51.2L128 132.48L50.56 0H0Z"></path><path fill="#35495E" d="M50.56 0L128 133.12L204.8 0h-47.36L128 51.2L97.92 0H50.56Z"></path></svg>
|
||||||
|
After Width: | Height: | Size: 496 B |
375
interactive/frontend/src/components/HelloWorld.vue
Normal file
375
interactive/frontend/src/components/HelloWorld.vue
Normal file
@@ -0,0 +1,375 @@
|
|||||||
|
<script setup lang="ts">
|
||||||
|
import {onMounted, ref, h, watch} from 'vue'
|
||||||
|
import axios, { AxiosError, type AxiosResponse } from "axios";
|
||||||
|
import type {SelectOption, DataTableSortState} from "naive-ui"
|
||||||
|
import { useMessage, NEllipsis, NButton, NPopover } from "naive-ui"
|
||||||
|
|
||||||
|
const BASE_URL = "/api" // http://10.126.126.11:5555/api
|
||||||
|
|
||||||
|
const APIs = {
|
||||||
|
unique: `${BASE_URL}/gene`,
|
||||||
|
content: `${BASE_URL}/records`,
|
||||||
|
};
|
||||||
|
|
||||||
|
const message = useMessage();
|
||||||
|
const genes = ref<SelectOption[]|null>(null);
|
||||||
|
const tables = [
|
||||||
|
{"value": "pridict2", "label": "Pridict2"},
|
||||||
|
{"value": "prime_design", "label": "Prime Design"},
|
||||||
|
]
|
||||||
|
|
||||||
|
interface FormData {
|
||||||
|
gene: string | null
|
||||||
|
source: string
|
||||||
|
pbs_len: number | null
|
||||||
|
rtt_len: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
const formData = ref<FormData>({
|
||||||
|
source: "pridict2",
|
||||||
|
gene: null,
|
||||||
|
pbs_len: 0,
|
||||||
|
rtt_len: 0,
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
interface Pagination {
|
||||||
|
order: string;
|
||||||
|
order_by: string;
|
||||||
|
total: number;
|
||||||
|
page: number;
|
||||||
|
length: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
const pagination = ref<Pagination>({
|
||||||
|
order: "desc",
|
||||||
|
order_by: "gene",
|
||||||
|
total: 10,
|
||||||
|
length: 10,
|
||||||
|
page: 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
function processSorter(
|
||||||
|
options: DataTableSortState | DataTableSortState[] | null,
|
||||||
|
) {
|
||||||
|
if (options !== null) {
|
||||||
|
options = options as DataTableSortState;
|
||||||
|
pagination.value.order_by = options.columnKey.toString();
|
||||||
|
pagination.value.order =
|
||||||
|
typeof options.order === "boolean" ? "asc" : options.order;
|
||||||
|
} else {
|
||||||
|
pagination.value.order_by = "id";
|
||||||
|
pagination.value.order = "descend";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
interface RowData {
|
||||||
|
id: number;
|
||||||
|
gene: string
|
||||||
|
aa: number
|
||||||
|
sequence: string
|
||||||
|
src: string
|
||||||
|
dst: string
|
||||||
|
k562: number|null
|
||||||
|
hek: number|null
|
||||||
|
k562_rank: number|null
|
||||||
|
hek_rank: number|null
|
||||||
|
template: string
|
||||||
|
strand: string
|
||||||
|
pbs_len: number
|
||||||
|
rtt_len: number
|
||||||
|
spacer: string|null
|
||||||
|
scaffold: string|null
|
||||||
|
pegrna: string|null
|
||||||
|
pbs: string
|
||||||
|
rtt: string
|
||||||
|
extension: string|null
|
||||||
|
before_spacer: string|null
|
||||||
|
after_spacer: string|null
|
||||||
|
before_pegnra_ext: string|null
|
||||||
|
after_pegnra_ext: string|null
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
let columns = [
|
||||||
|
{
|
||||||
|
title: "Gene", key: "gene", defaultSortOrder: "descend", width: 60, resizable: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
title: "AA (n)", key: "aa", defaultSortOrder: "ascend", width: 50, resizable: true, sorter: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
title: "name", key: "sequence", defaultSortOrder: "ascend", width: 100, resizable: true, sorter: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
title: "原序列", key: "src", defaultSortOrder: "ascend", width: 60, resizable: true, sorter: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
title: "编辑后", key: "dst", defaultSortOrder: "ascend", width: 60, resizable: true, sorter: true,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
let post_columns = [
|
||||||
|
{title: "strand", key: "strand", defaultSortOrder: "ascend", maxWidth: 20, resizable: true,},
|
||||||
|
{title: "PBS len", key: "pbs_len", defaultSortOrder: "ascend", maxWidth: 20, resizable: true, sorter: true,},
|
||||||
|
{title: "RTT len", key: "rtt_len", defaultSortOrder: "ascend", maxWidth: 20, resizable: true, sorter: true,},
|
||||||
|
{title: "PBS", key: "pbs", defaultSortOrder: "ascend", width: 120, resizable: true,},
|
||||||
|
{title: "RTT", key: "rtt", defaultSortOrder: "ascend", width: 120, resizable: true,}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
const copyText = (text: string) => {
|
||||||
|
const textArea = document.createElement("textarea");
|
||||||
|
textArea.value = text;
|
||||||
|
|
||||||
|
textArea.style.top = "0";
|
||||||
|
textArea.style.left = "0";
|
||||||
|
textArea.style.position = "fixed";
|
||||||
|
|
||||||
|
document.body.appendChild(textArea);
|
||||||
|
textArea.focus();
|
||||||
|
textArea.select();
|
||||||
|
|
||||||
|
try {
|
||||||
|
document.execCommand("copy");
|
||||||
|
document.body.removeChild(textArea);
|
||||||
|
return Promise.resolve();
|
||||||
|
} catch (err) {
|
||||||
|
document.body.removeChild(textArea);
|
||||||
|
return Promise.reject(err);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
const createColumns = () => {
|
||||||
|
|
||||||
|
let real_columns = [...columns]
|
||||||
|
|
||||||
|
let rest_columns = ["spacer", "extension", "before_spacer", "after_spacer", "before_pegnra_ext", "after_pegnra_ext"]
|
||||||
|
if (formData.value.source === "pridict2") {
|
||||||
|
rest_columns = ["template", "spacer", "scaffold", "pegrna"]
|
||||||
|
|
||||||
|
for (let i of ["k562", "hek"]) {
|
||||||
|
real_columns.push(
|
||||||
|
{
|
||||||
|
title: i, key: i, width: 60, resizable: true, sorter: true,
|
||||||
|
render: function(row: RowData) {
|
||||||
|
return h(
|
||||||
|
NPopover, {
|
||||||
|
trigger: "hover"
|
||||||
|
}, {
|
||||||
|
trigger: () => `${parseFloat(row[i].toFixed(2))} (${row[i + "_rank"]})`,
|
||||||
|
default: () => {`Score: ${row[i]}; Rank=${row[i + "_rank"]}`}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
real_columns = real_columns.concat(post_columns)
|
||||||
|
|
||||||
|
for (let i of rest_columns) {
|
||||||
|
real_columns.push({
|
||||||
|
title:i,
|
||||||
|
key: i,
|
||||||
|
width: 240,
|
||||||
|
resizable: true,
|
||||||
|
render: (row: RowData) => {
|
||||||
|
return h(
|
||||||
|
NButton, {
|
||||||
|
size:"small", type: "primary", dashed: true,
|
||||||
|
onClick: () => {copyText(row[i])}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
default: () => {
|
||||||
|
return h(
|
||||||
|
NEllipsis, {
|
||||||
|
style: "max-width: 200px",
|
||||||
|
tooltip: {
|
||||||
|
style: {
|
||||||
|
maxWidth: '300px',
|
||||||
|
whiteSpace: 'pre-wrap', // 关键:允许换行
|
||||||
|
wordBreak: 'break-word' // 允许单词内换行
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{default: () => row[i]}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return real_columns
|
||||||
|
};
|
||||||
|
const loading = ref(false)
|
||||||
|
|
||||||
|
const data = ref<RowData[]>([]);
|
||||||
|
|
||||||
|
const getRecords = () => {
|
||||||
|
loading.value = true;
|
||||||
|
let params = {
|
||||||
|
gene: formData.value.gene,
|
||||||
|
pbs_len: formData.value.pbs_len,
|
||||||
|
rtt_len: formData.value.rtt_len,
|
||||||
|
source: formData.value.source,
|
||||||
|
offset: pagination.value.page,
|
||||||
|
length: pagination.value.length,
|
||||||
|
order_by: pagination.value.order_by,
|
||||||
|
order: pagination.value.order,
|
||||||
|
}
|
||||||
|
axios
|
||||||
|
.get(APIs.content, { params: params })
|
||||||
|
.then((response: AxiosResponse) => {
|
||||||
|
let resp = response.data;
|
||||||
|
data.value = resp.data;
|
||||||
|
|
||||||
|
if (resp.total !== pagination.value.total) {
|
||||||
|
pagination.value.total = resp.total;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (resp.length !== pagination.value.length) {
|
||||||
|
pagination.value.length = resp.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pagination.value.page > Math.ceil(resp.total / resp.length)) {
|
||||||
|
pagination.value.page = Math.ceil(resp.total / resp.length);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch((error: Error | AxiosError) => {
|
||||||
|
message.error(error.message);
|
||||||
|
}).finally(() => {
|
||||||
|
loading.value = false;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
onMounted(() => {
|
||||||
|
axios.get(APIs.unique).then((response: AxiosResponse) => {
|
||||||
|
let res = []
|
||||||
|
|
||||||
|
for (let i of response.data) {
|
||||||
|
res.push({"value": i, "label": i})
|
||||||
|
}
|
||||||
|
genes.value = res
|
||||||
|
formData.value.gene = res[0].value
|
||||||
|
}).finally(() => {
|
||||||
|
getRecords()
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
watch(
|
||||||
|
() => [formData, pagination],
|
||||||
|
(_) => {
|
||||||
|
axios.get(APIs.unique, {params: {source: formData.value.source}}).then((response: AxiosResponse) => {
|
||||||
|
let res = []
|
||||||
|
|
||||||
|
for (let i of response.data) {
|
||||||
|
res.push({"value": i, "label": i})
|
||||||
|
}
|
||||||
|
genes.value = res
|
||||||
|
}).finally(() => {
|
||||||
|
getRecords()
|
||||||
|
})
|
||||||
|
},
|
||||||
|
{ deep: true },
|
||||||
|
);
|
||||||
|
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<template>
|
||||||
|
<n-grid cols="24" :y-gap="8" item-responsive>
|
||||||
|
<n-gi span="0 400:1 800:2" responsive="self" />
|
||||||
|
<n-gi span="24 400:22 600:20" responsive="self">
|
||||||
|
<n-layout>
|
||||||
|
<n-layout-header style="min-height: 30px; padding: 10px" bordered>
|
||||||
|
<n-form label-placement="left">
|
||||||
|
<n-flex justify="space-around" style="margin-right: 10px">
|
||||||
|
<!-- 查询界面 -->
|
||||||
|
<n-grid cols="4" :x-gap="12" :y-gap="8" item-responsive>
|
||||||
|
<n-gi span="4 400:2 800:1" responsive="self">
|
||||||
|
<n-form-item label="表">
|
||||||
|
<n-select v-model:value="formData.source" :options="tables" filterable clearable/>
|
||||||
|
</n-form-item>
|
||||||
|
</n-gi>
|
||||||
|
<n-gi span="4 400:2 800:1" responsive="self">
|
||||||
|
<n-form-item label="基因">
|
||||||
|
<n-select v-model:value="formData.gene" :options="genes" filterable clearable/>
|
||||||
|
</n-form-item>
|
||||||
|
</n-gi>
|
||||||
|
<n-gi span="4 400:2 800:1" responsive="self">
|
||||||
|
<n-form-item label="PBS len <= ">
|
||||||
|
<n-input-number v-model:value="formData.pbs_len" clearable/>
|
||||||
|
</n-form-item>
|
||||||
|
</n-gi>
|
||||||
|
<n-gi span="4 400:2 800:1" responsive="self">
|
||||||
|
<n-form-item label="RTT len <= ">
|
||||||
|
<n-input-number v-model:value="formData.rtt_len" clearable/>
|
||||||
|
</n-form-item>
|
||||||
|
</n-gi>
|
||||||
|
</n-grid>
|
||||||
|
</n-flex>
|
||||||
|
</n-form>
|
||||||
|
</n-layout-header>
|
||||||
|
|
||||||
|
<n-layout-content style="padding-top: 10px; padding-left: 5px" bordered>
|
||||||
|
<n-flex justify="center">
|
||||||
|
<n-pagination
|
||||||
|
v-model:page="pagination.page"
|
||||||
|
:page-sizes="[10, 20, 30, 40]"
|
||||||
|
:item-count="pagination.total"
|
||||||
|
v-model:page-size="pagination.length"
|
||||||
|
show-quick-jumper
|
||||||
|
show-size-picker
|
||||||
|
style="padding: 5px"
|
||||||
|
/>
|
||||||
|
</n-flex>
|
||||||
|
|
||||||
|
<n-data-table
|
||||||
|
:columns="createColumns()"
|
||||||
|
:data="data"
|
||||||
|
:loading="loading"
|
||||||
|
:scroll-x="1800"
|
||||||
|
width="100%"
|
||||||
|
:max-height="600"
|
||||||
|
:row-key="
|
||||||
|
(row: RowData) => (row.id)
|
||||||
|
"
|
||||||
|
striped
|
||||||
|
bordered
|
||||||
|
@update:sorter="processSorter"
|
||||||
|
sticky-expanded-rows
|
||||||
|
/>
|
||||||
|
|
||||||
|
<n-flex justify="center">
|
||||||
|
<n-pagination
|
||||||
|
v-model:page="pagination.page"
|
||||||
|
:page-sizes="[10, 20, 30, 40]"
|
||||||
|
:item-count="pagination.total"
|
||||||
|
v-model:page-size="pagination.length"
|
||||||
|
show-quick-jumper
|
||||||
|
show-size-picker
|
||||||
|
style="padding: 5px"
|
||||||
|
/>
|
||||||
|
</n-flex>
|
||||||
|
</n-layout-content>
|
||||||
|
</n-layout>
|
||||||
|
</n-gi>
|
||||||
|
</n-grid>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<style scoped>
|
||||||
|
/* 关键:为表格容器设置固定宽度和溢出控制 */
|
||||||
|
.table-container {
|
||||||
|
width: 100%; /* 或者固定宽度,如 1200px */
|
||||||
|
overflow-x: auto; /* 确保容器可水平滚动 */
|
||||||
|
}
|
||||||
|
|
||||||
|
</style>
|
||||||
4
interactive/frontend/src/main.ts
Normal file
4
interactive/frontend/src/main.ts
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
import { createApp } from "vue";
|
||||||
|
import App from "./App.vue";
|
||||||
|
|
||||||
|
createApp(App).mount("#app");
|
||||||
79
interactive/frontend/src/style.css
Normal file
79
interactive/frontend/src/style.css
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
:root {
|
||||||
|
font-family: system-ui, Avenir, Helvetica, Arial, sans-serif;
|
||||||
|
line-height: 1.5;
|
||||||
|
font-weight: 400;
|
||||||
|
|
||||||
|
color-scheme: light dark;
|
||||||
|
color: rgba(255, 255, 255, 0.87);
|
||||||
|
background-color: #242424;
|
||||||
|
|
||||||
|
font-synthesis: none;
|
||||||
|
text-rendering: optimizeLegibility;
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
-moz-osx-font-smoothing: grayscale;
|
||||||
|
}
|
||||||
|
|
||||||
|
a {
|
||||||
|
font-weight: 500;
|
||||||
|
color: #646cff;
|
||||||
|
text-decoration: inherit;
|
||||||
|
}
|
||||||
|
a:hover {
|
||||||
|
color: #535bf2;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
margin: 0;
|
||||||
|
display: flex;
|
||||||
|
place-items: center;
|
||||||
|
min-width: 320px;
|
||||||
|
min-height: 100vh;
|
||||||
|
}
|
||||||
|
|
||||||
|
h1 {
|
||||||
|
font-size: 3.2em;
|
||||||
|
line-height: 1.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
button {
|
||||||
|
border-radius: 8px;
|
||||||
|
border: 1px solid transparent;
|
||||||
|
padding: 0.6em 1.2em;
|
||||||
|
font-size: 1em;
|
||||||
|
font-weight: 500;
|
||||||
|
font-family: inherit;
|
||||||
|
background-color: #1a1a1a;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: border-color 0.25s;
|
||||||
|
}
|
||||||
|
button:hover {
|
||||||
|
border-color: #646cff;
|
||||||
|
}
|
||||||
|
button:focus,
|
||||||
|
button:focus-visible {
|
||||||
|
outline: 4px auto -webkit-focus-ring-color;
|
||||||
|
}
|
||||||
|
|
||||||
|
.card {
|
||||||
|
padding: 2em;
|
||||||
|
}
|
||||||
|
|
||||||
|
#app {
|
||||||
|
max-width: 1280px;
|
||||||
|
margin: 0 auto;
|
||||||
|
padding: 2rem;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (prefers-color-scheme: light) {
|
||||||
|
:root {
|
||||||
|
color: #213547;
|
||||||
|
background-color: #ffffff;
|
||||||
|
}
|
||||||
|
a:hover {
|
||||||
|
color: #747bff;
|
||||||
|
}
|
||||||
|
button {
|
||||||
|
background-color: #f9f9f9;
|
||||||
|
}
|
||||||
|
}
|
||||||
16
interactive/frontend/tsconfig.app.json
Normal file
16
interactive/frontend/tsconfig.app.json
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"extends": "@vue/tsconfig/tsconfig.dom.json",
|
||||||
|
"compilerOptions": {
|
||||||
|
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
|
||||||
|
"types": ["vite/client"],
|
||||||
|
|
||||||
|
/* Linting */
|
||||||
|
"strict": true,
|
||||||
|
"noUnusedLocals": true,
|
||||||
|
"noUnusedParameters": true,
|
||||||
|
"erasableSyntaxOnly": true,
|
||||||
|
"noFallthroughCasesInSwitch": true,
|
||||||
|
"noUncheckedSideEffectImports": true
|
||||||
|
},
|
||||||
|
"include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.vue"]
|
||||||
|
}
|
||||||
7
interactive/frontend/tsconfig.json
Normal file
7
interactive/frontend/tsconfig.json
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"files": [],
|
||||||
|
"references": [
|
||||||
|
{ "path": "./tsconfig.app.json" },
|
||||||
|
{ "path": "./tsconfig.node.json" }
|
||||||
|
]
|
||||||
|
}
|
||||||
26
interactive/frontend/tsconfig.node.json
Normal file
26
interactive/frontend/tsconfig.node.json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
|
||||||
|
"target": "ES2023",
|
||||||
|
"lib": ["ES2023"],
|
||||||
|
"module": "ESNext",
|
||||||
|
"types": ["node"],
|
||||||
|
"skipLibCheck": true,
|
||||||
|
|
||||||
|
/* Bundler mode */
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
"allowImportingTsExtensions": true,
|
||||||
|
"verbatimModuleSyntax": true,
|
||||||
|
"moduleDetection": "force",
|
||||||
|
"noEmit": true,
|
||||||
|
|
||||||
|
/* Linting */
|
||||||
|
"strict": true,
|
||||||
|
"noUnusedLocals": true,
|
||||||
|
"noUnusedParameters": true,
|
||||||
|
"erasableSyntaxOnly": true,
|
||||||
|
"noFallthroughCasesInSwitch": true,
|
||||||
|
"noUncheckedSideEffectImports": true
|
||||||
|
},
|
||||||
|
"include": ["vite.config.ts"]
|
||||||
|
}
|
||||||
38
interactive/frontend/vite.config.ts
Normal file
38
interactive/frontend/vite.config.ts
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import vue from "@vitejs/plugin-vue";
|
||||||
|
import AutoImport from "unplugin-auto-import/vite";
|
||||||
|
import { NaiveUiResolver } from "unplugin-vue-components/resolvers";
|
||||||
|
import Components from "unplugin-vue-components/vite";
|
||||||
|
// vite.config.ts
|
||||||
|
import { defineConfig } from "vite";
|
||||||
|
|
||||||
|
// https://vitejs.dev/config/
|
||||||
|
export default defineConfig({
|
||||||
|
server: {
|
||||||
|
watch: {
|
||||||
|
// 使用轮询模式,避免文件描述符问题
|
||||||
|
usePolling: true,
|
||||||
|
interval: 1000,
|
||||||
|
// 忽略不需要监视的目录
|
||||||
|
ignored: ["**/node_modules/**", "**/.git/**", "**/.next/**"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
plugins: [
|
||||||
|
vue(),
|
||||||
|
AutoImport({
|
||||||
|
imports: [
|
||||||
|
"vue",
|
||||||
|
{
|
||||||
|
"naive-ui": [
|
||||||
|
"useDialog",
|
||||||
|
"useMessage",
|
||||||
|
"useNotification",
|
||||||
|
"useLoadingBar",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
Components({
|
||||||
|
resolvers: [NaiveUiResolver()],
|
||||||
|
}),
|
||||||
|
],
|
||||||
|
});
|
||||||
133
interactive/main.py
Normal file
133
interactive/main.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
from peewee import SQL
|
||||||
|
from flask import Flask, jsonify, request, abort, send_from_directory
|
||||||
|
from flask_cors import CORS
|
||||||
|
from db import insert, index, Pridict2, PrimeDesign, table_columns
|
||||||
|
|
||||||
|
|
||||||
|
app = Flask(__name__, static_folder="./frontend/dist")
|
||||||
|
CORS(app)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/<path:filename>")
|
||||||
|
def static_files(filename):
|
||||||
|
"""专门处理带扩展名的文件"""
|
||||||
|
if "." not in filename:
|
||||||
|
abort(404) # 无扩展名不应走这里
|
||||||
|
try:
|
||||||
|
return send_from_directory(app.static_folder, filename)
|
||||||
|
except FileNotFoundError:
|
||||||
|
abort(404) # 静态文件不存在就是 404
|
||||||
|
|
||||||
|
@app.route("/", defaults={"path": ""})
|
||||||
|
@app.route("/<path:path>")
|
||||||
|
def main(path):
|
||||||
|
"""仅处理 SPA 路由(无扩展名)"""
|
||||||
|
if "." in os.path.basename(path):
|
||||||
|
# 包含扩展名?说明应该是静态文件,但没被上面的路由捕获 → 404
|
||||||
|
abort(404)
|
||||||
|
return send_from_directory(app.static_folder, "index.html")
|
||||||
|
|
||||||
|
|
||||||
|
def default_value(val, default):
    """Parse *val* as an int; return *default* when it cannot be parsed.

    Used for query-string parameters, which arrive as ``str`` or ``None``.
    Catches only the exceptions ``int()`` actually raises instead of the
    original blanket ``except Exception``.
    """
    try:
        return int(val)
    except (TypeError, ValueError):
        # None, non-numeric strings, etc. fall back to the default.
        return default
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/gene")
|
||||||
|
def gene():
|
||||||
|
genes = set()
|
||||||
|
|
||||||
|
source = request.args.get("source", "pridict2")
|
||||||
|
|
||||||
|
tables = {
|
||||||
|
"pridict2": Pridict2,
|
||||||
|
"prime_design": PrimeDesign,
|
||||||
|
}
|
||||||
|
|
||||||
|
table = tables.get(source)
|
||||||
|
if not table:
|
||||||
|
return jsonify({"message": "No such table"}), 404
|
||||||
|
|
||||||
|
for i in table.select(table.gene.distinct()):
|
||||||
|
genes.add(i.gene)
|
||||||
|
|
||||||
|
return jsonify(sorted(genes))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/records")
|
||||||
|
def records():
|
||||||
|
source = request.args.get("source", "pridict2")
|
||||||
|
|
||||||
|
tables = {
|
||||||
|
"pridict2": Pridict2,
|
||||||
|
"prime_design": PrimeDesign,
|
||||||
|
}
|
||||||
|
|
||||||
|
table = tables.get(source)
|
||||||
|
if not table:
|
||||||
|
return jsonify({"message": "No such table"}), 404
|
||||||
|
|
||||||
|
columns = table_columns(table)
|
||||||
|
where = None
|
||||||
|
for i in ["gene", "dst", "src"]:
|
||||||
|
value = request.args.get(i)
|
||||||
|
if value:
|
||||||
|
if where is None:
|
||||||
|
where = (SQL(i) == value)
|
||||||
|
else:
|
||||||
|
where = (where) & (SQL(i) == value)
|
||||||
|
|
||||||
|
for i in ["pbs_len", "rtt_len"]:
|
||||||
|
value = default_value(request.args.get(i), 0)
|
||||||
|
if value:
|
||||||
|
if where is None:
|
||||||
|
where = (SQL(i) <= value)
|
||||||
|
else:
|
||||||
|
where = (where) & (SQL(i) <= value)
|
||||||
|
|
||||||
|
query = table.select().where(where)
|
||||||
|
total = query.count()
|
||||||
|
|
||||||
|
order_by = request.args.get("order_by")
|
||||||
|
if order_by and order_by in columns:
|
||||||
|
order = request.args.get("order", "asc")
|
||||||
|
if "desc" in order:
|
||||||
|
query = query.order_by(SQL(order_by).desc())
|
||||||
|
else:
|
||||||
|
query = query.order_by(SQL(order_by))
|
||||||
|
else:
|
||||||
|
query = query.order_by(table.gene, table.aa, table.src, table.dst)
|
||||||
|
|
||||||
|
offset = default_value(request.args.get("offset"), 1)
|
||||||
|
if offset <= 0:
|
||||||
|
offset = 1
|
||||||
|
|
||||||
|
length = default_value(request.args.get("length"), 1)
|
||||||
|
if length > 200:
|
||||||
|
length = 200
|
||||||
|
query = query.offset((int(offset) - 1) * length).limit(int(length))
|
||||||
|
print(query.sql())
|
||||||
|
return jsonify({
|
||||||
|
"data": [x for x in query.dicts()],
|
||||||
|
"total": total,
|
||||||
|
"offset": offset,
|
||||||
|
"length": length,
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def main(host: str="0.0.0.0", port=5555):
|
||||||
|
app.run(host=host, port=port, threaded=True, debug=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
from fire import Fire
|
||||||
|
Fire({
|
||||||
|
"insert": insert,
|
||||||
|
"index": index,
|
||||||
|
"server": main
|
||||||
|
})
|
||||||
9
interactive/pyproject.toml
Normal file
9
interactive/pyproject.toml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
[project]
|
||||||
|
name = "interactive"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Add your description here"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"peewee>=3.19.0",
|
||||||
|
]
|
||||||
53
merge_results.py
Normal file
53
merge_results.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import gzip
|
||||||
|
import polars as pd
|
||||||
|
from glob import glob
|
||||||
|
from tqdm import tqdm
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(args):
    """Read a single per-sample result CSV into a polars DataFrame.

    Args:
        args: A two-item sequence ``[path, nicking]`` (packed for
            ``Pool.imap``): the CSV path and whether the inputs are
            nicking-design files.

    Returns:
        A polars DataFrame with a ``conf`` column ("high"/"low" derived
        from the path) and, for nicking files, a ``sequence_name`` column;
        ``None`` for empty, excluded, or unreadable files.
    """
    path, nicking = args

    # Skip empty files outright.
    if os.path.getsize(path) < 1:
        return None

    # Skip excluded genes. Compare against the basename: `path` begins
    # with the input directory (it comes from glob(indir, "*")), so the
    # original `path.startswith(gene)` could never match.
    # NOTE(review): "FABCI" looks like a typo for "FANCI" — confirm with
    # the gene list before relying on this exclusion.
    name = os.path.basename(path)
    for gene in ["FANCD2", "BRIP1", "RAD51C", "FABCI", "FANCA"]:
        if name.startswith(gene):
            return None

    try:
        # `pd` is polars in this module (import polars as pd).
        df = pd.read_csv(path)

        if nicking:
            # Sample name is everything before the "_nicking" suffix.
            key = os.path.basename(path).split("_nicking")[0]
            df = df.with_columns(sequence_name=pd.lit(key))
    except Exception:
        # Best-effort merge: report the unreadable file and keep going.
        print(path)
        return None

    # Tag confidence from the path convention ("low_conf" in the filename).
    if "low_conf" not in path:
        df = df.with_columns(conf=pd.lit("high"))
    else:
        df = df.with_columns(conf=pd.lit("low"))
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def main(indir, output, nicking=False):
    """Merge every per-sample CSV under *indir* into one gzipped CSV.

    Args:
        indir: Directory containing the per-sample result files.
        output: Path of the gzipped merged CSV to write.
        nicking: Passed through to ``read_file`` to control the
            ``sequence_name`` column.
    """
    print(indir, output, nicking)

    paths = glob(os.path.join(indir, "*"))

    # Parse files in parallel; read_file returns None for skipped files.
    with Pool(6) as pool:
        tasks = [[p, nicking] for p in paths]
        frames = list(tqdm(pool.imap(read_file, tasks), total=len(paths)))

    merged = pd.concat([frame for frame in frames if frame is not None])

    with gzip.open(output, "w+") as w:
        merged.write_csv(w)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: arguments map onto main(indir, output, nicking).
    from fire import Fire

    Fire(main)
|
||||||
113
select_primedesign.py
Normal file
113
select_primedesign.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import csv
|
||||||
|
import gzip
|
||||||
|
import random
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# Fixed RNG seed: `filter_` calls random.seed(seed) before every
# random.sample so the down-sampling is reproducible across runs.
seed = 42
|
||||||
|
|
||||||
|
|
||||||
|
# Standard genetic code grouped by amino acid.
# Keys are Chinese amino-acid names (e.g. 丙氨酸 = alanine, 终止密码子 =
# stop codon) and are runtime data written into the output "dst" column —
# do not translate or rename them. Values are the DNA codons encoding
# that amino acid.
total_codons = {
    "丙氨酸": ["GCT", "GCC", "GCA", "GCG"],
    "精氨酸": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "天冬酰胺": ["AAT", "AAC"],
    "天冬氨酸": ["GAT", "GAC"],
    "半胱氨酸": ["TGT", "TGC"],
    "谷氨酰胺": ["CAA", "CAG"],
    "谷氨酸": ["GAA", "GAG"],
    "甘氨酸": ["GGT", "GGC", "GGA", "GGG"],
    "组氨酸": ["CAT", "CAC"],
    "异亮氨酸": ["ATT", "ATC", "ATA"],
    "亮氨酸": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    "赖氨酸": ["AAA", "AAG"],
    "甲硫氨酸": ["ATG"],
    "苯丙氨酸": ["TTT", "TTC"],
    "脯氨酸": ["CCT", "CCC", "CCA", "CCG"],
    "丝氨酸": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
    "苏氨酸": ["ACT", "ACC", "ACA", "ACG"],
    "色氨酸": ["TGG"],
    "酪氨酸": ["TAT", "TAC"],
    "缬氨酸": ["GTT", "GTC", "GTA", "GTG"],
    "终止密码子": ["TAA", "TAG", "TGA"]
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_finished(ref):
    """Load already-completed (gene, position) pairs from an Excel sheet.

    Args:
        ref: Path of an Excel file with "gene" and "aa_pos" columns.

    Returns:
        dict mapping gene name -> set of amino-acid positions (as strings).
    """
    table = pd.read_excel(ref)

    genes = {}
    for _, record in table.iterrows():
        # Positions are stored as strings to match the parsed Target_name.
        genes.setdefault(record["gene"], set()).add(str(record["aa_pos"]))
    return genes
|
||||||
|
|
||||||
|
|
||||||
|
def reader(path):
    """Yield dict rows from a gzipped CSV file, with a progress bar."""
    with gzip.open(path, "rt") as handle:
        yield from tqdm(csv.DictReader(handle))
|
||||||
|
|
||||||
|
|
||||||
|
def filter_(args):
    """Filter primedesign rows and down-sample to 2 designs per target.

    Args:
        args: A three-item sequence ``[finished, path, output]``:
            the dict from ``load_finished``, the gzipped input CSV path,
            and the gzipped output CSV path.

    Side effects:
        Writes the selected rows to *output* as a gzipped CSV.
    """
    finished, path, output = args

    data = {}

    for row in reader(path):
        # Target_name is "<gene>_AA<pos>_..._<dst-codon>".
        key = row['Target_name'].split("_")
        gene = key[0]
        pos = key[1].replace("AA", "")
        dst = key[-1]

        # Group designs by target. BUGFIX: the original interpolated the
        # whole split list (`{key}`), yielding uids like
        # "['BRCA1', 'AA5', 'GCT']_5_GCT"; use the gene name instead.
        uid = f"{gene}_{pos}_{dst}"
        for k, codons in total_codons.items():
            if dst in codons:
                # Keep only targets whose gene is tracked and whose
                # position is not already finished.
                if gene in finished and pos not in finished[gene]:
                    row["dst"] = k
                    row["aa_pos"] = pos
                    row["gene"] = gene

                    # NOTE(review): indentation was lost in extraction;
                    # appending only filtered rows matches the function's
                    # purpose — confirm against the original file.
                    if uid not in data:
                        data[uid] = []
                    data[uid].append(row)

    dict_writer = None
    with gzip.open(output, "wt+") as w:
        for _, lines in tqdm(data.items()):
            # Keep at most 2 designs per target, sampled reproducibly.
            if len(lines) > 2:
                random.seed(seed)
                lines = random.sample(lines, 2)

            # Lazily create the writer from the first row's fields.
            if dict_writer is None:
                dict_writer = csv.DictWriter(w, fieldnames=lines[0].keys())
                dict_writer.writeheader()

            # 写入数据行 (write the data rows)
            dict_writer.writerows(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main(ref, indir, outdir):
    """Filter primedesign output, excluding targets already finished in *ref*.

    Args:
        ref: Excel file of completed (gene, aa_pos) pairs.
        indir: Gzipped input CSV of candidate designs.
        outdir: Gzipped output CSV path.
    """
    completed = load_finished(ref)
    filter_([completed, indir, outdir])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: arguments map onto main(ref, indir, outdir).
    from fire import Fire

    Fire(main)
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user