提交饱和编辑的相关设计,及检验代码

This commit is contained in:
2026-02-26 14:02:42 +08:00
commit cb556b47c0
36 changed files with 5437 additions and 0 deletions

222
interactive/db.py Normal file
View File

@@ -0,0 +1,222 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import gzip
import re
from typing import Any, Dict

import peewee as pw
# Single SQLite handle shared by every model in this module. The path is
# relative, so the file is resolved against the process's working directory.
db = pw.SqliteDatabase("./pegrna.db")


class BaseModel(pw.Model):
    """Common parent of all tables; binds every subclass to ``db``."""

    class Meta:
        database = db
# CSV header -> model field name for PRIDICT2 result files.
_PRIDICT2_KEYS = {
    "sequence_name": "sequence",
    "EditedAllele": "dst",
    "OriginalAllele": "src",
    "PRIDICT2_0_editing_Score_deep_K562": "k562",
    "PRIDICT2_0_editing_Score_deep_HEK": "hek",
    "K562_rank": "k562_rank",
    "HEK_rank": "hek_rank",
    "PRIDICT2_Format": "template",
    "Target-Strand": "strand",
    "PBSlength": "pbs_len",
    "RToverhanglength": "rtt_oh_len",
    "RTlength": "rtt_len",
    "Spacer-Sequence": "spacer",
    "Scaffold_Optimized": "scaffold",
    "pegRNA": "pegrna",
    "PBSrevcomp": "pbs",
    "RTseqoverhangrevcomp": "rtt_oh",
    "RTrevcomp": "rtt",
}

# CSV header -> model field name for PrimeDesign result files.
# No header is mapped to "src" / "dst" here: format_data() derives both from
# the sequence name when they are absent.
_PRIME_DESIGN_KEYS = {
    "Target_name": "sequence",
    "Target_sequence": "template",
    "Strand": "strand",
    "PBS_length": "pbs_len",
    "RTT_length": "rtt_len",
    "Spacer_sequence": "spacer",
    "PAM_sequence": "pam",
    "Extension_sequence": "extension",  # RTT + PBS
    "Spacer_sequence_order_TOP": "before_spacer",
    "Spacer_sequence_order_BOTTOM": "after_spacer",
    "pegRNA_extension_sequence_order_TOP": "before_pegnra_ext",
    "pegRNA_extension_sequence_order_BOTTOM": "after_pegnra_ext",
}

# Lower-cased tool name -> header mapping; insert() looks entries up by kind.
KEY_MAP = {
    "pridict2": _PRIDICT2_KEYS,
    "prime_design": _PRIME_DESIGN_KEYS,
}
def bulk_insert(table, data, chunk = 100):
    """Insert ``data`` into ``table`` in batches inside one transaction.

    Args:
        table: Model class that provides ``insert_many``.
        data: Sliceable sequence of row dicts.
        chunk: Maximum number of rows sent per ``insert_many`` statement.
    """
    # One atomic block wraps all batches, so they are committed together.
    with db.atomic():
        for start in range(0, len(data), chunk):
            batch = data[start:start + chunk]
            statement = table.insert_many(batch)
            statement.execute()
class Pridict2(BaseModel):
    """Row loaded from a PRIDICT2 CSV file (headers: ``KEY_MAP["pridict2"]``).

    ``gene`` and ``aa`` have no CSV header of their own; ``format_data``
    derives them from the underscore-separated ``sequence`` name.
    """

    # Derived from the sequence name.
    gene = pw.CharField()
    aa = pw.IntegerField()

    # Edit identity (sequence_name / OriginalAllele / EditedAllele).
    sequence = pw.CharField()
    src = pw.CharField()
    dst = pw.CharField()

    # Editing scores and ranks from the *_K562 / *_HEK columns.
    k562 = pw.FloatField()
    hek = pw.FloatField()
    k562_rank = pw.IntegerField()
    hek_rank = pw.IntegerField()

    # Target description (PRIDICT2_Format / Target-Strand).
    template = pw.CharField()
    strand = pw.CharField()

    # Component lengths.
    pbs_len = pw.IntegerField()
    rtt_oh_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    # Component sequences.
    spacer = pw.CharField()
    scaffold = pw.CharField()
    pegrna = pw.CharField()
    pbs = pw.CharField()
    rtt_oh = pw.CharField()
    rtt = pw.CharField()

    class Meta:
        table_name = "pridict2"
class PrimeDesign(BaseModel):
    """Row loaded from a PrimeDesign CSV file (headers: ``KEY_MAP["prime_design"]``).

    ``gene``, ``aa``, ``src``, ``dst``, ``pbs`` and ``rtt`` have no CSV header
    in that mapping; ``format_data`` fills them in from the sequence name and
    from the extension sequence.
    """

    # Derived from the sequence name.
    gene = pw.CharField()
    aa = pw.IntegerField()

    # Edit identity (Target_name, plus alleles taken from that name).
    sequence = pw.CharField()
    src = pw.CharField()
    dst = pw.CharField()

    # Target description (Target_sequence / Strand).
    template = pw.CharField()
    strand = pw.CharField()

    # Component lengths.
    pbs_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    # Component sequences; pbs and rtt are cut out of ``extension``.
    pam = pw.CharField()
    spacer = pw.CharField()
    extension = pw.CharField()
    pbs = pw.CharField()
    rtt = pw.CharField()

    # Values of the *_order_TOP / *_order_BOTTOM columns. The "pegnra"
    # spelling is the actual column name and is referenced by KEY_MAP, so it
    # must not be changed here in isolation.
    before_spacer = pw.CharField()
    after_spacer = pw.CharField()
    before_pegnra_ext = pw.CharField()
    after_pegnra_ext = pw.CharField()

    class Meta:
        table_name = "prime_design"
def format_data(value: Dict, mapping: Dict[str, str]) -> Dict[str, Any]:
    """Translate one raw CSV row into a dict keyed by model field names.

    Only headers present in ``mapping`` are kept. The row's ``sequence`` name
    is split on "_" and read as ``<gene>_<position>_..._<src>_<dst>``:

    * ``gene`` is the first part;
    * ``aa`` is the second part with every non-digit character removed;
    * ``src`` / ``dst`` are the last two parts, used only when the row does
      not already carry a non-empty ``src``.

    When the row has no ``pbs`` but has ``extension``, ``pbs_len`` and
    ``rtt_len`` whose lengths add up, the extension is split into ``rtt`` and
    ``pbs``. If the lengths do not add up, neither key is set.

    Args:
        value: Raw row, e.g. as produced by ``csv.DictReader``.
        mapping: Source header -> destination field name.

    Returns:
        A new dict holding the renamed and derived fields.

    Raises:
        KeyError: If no header maps to ``sequence``.
        IndexError: If the sequence name has fewer than two "_" parts.
        ValueError: If the second part of the name contains no digit, or a
            length field is not an integer.
    """
    res = {mapping[key]: cell for key, cell in value.items() if key in mapping}

    name_parts = res["sequence"].split("_")
    if not res.get("src"):
        res["src"] = name_parts[-2]
        res["dst"] = name_parts[-1]
    res["aa"] = int(re.sub(r"\D", "", name_parts[1]))
    res["gene"] = name_parts[0]

    if not res.get("pbs") and res.get("extension") and res.get("pbs_len") and res.get("rtt_len"):
        extension = res["extension"]
        pbs_len = int(res["pbs_len"])
        rtt_len = int(res["rtt_len"])
        if len(extension) == pbs_len + rtt_len:
            # The extension is laid out as RTT followed by PBS (see the
            # "RTT + PBS" note on Extension_sequence in KEY_MAP), so the RTT
            # is the leading rtt_len characters and the PBS is the remainder.
            res["pbs"] = extension[rtt_len:]
            res["rtt"] = extension[:rtt_len]
    return res
def insert(path: str, kind: str = "PRIDICT2", chunk: int = 10000):
    """Load a gzip-compressed CSV file into the table selected by ``kind``.

    Both tables are created first if they do not exist yet. Rows are parsed
    one at a time, converted with ``format_data`` and buffered; every time
    the buffer reaches ``chunk`` rows it is written with ``bulk_insert`` and
    progress is printed.

    Args:
        path: Path to a gzip-compressed, UTF-8 encoded CSV file with a
            header row.
        kind: Key of ``KEY_MAP``, matched case-insensitively.
        chunk: Number of parsed rows buffered in memory between writes.

    Raises:
        ValueError: If ``kind`` is not a key of ``KEY_MAP``.
    """
    kind = kind.lower()
    # Explicit check rather than ``assert`` so it still runs under
    # ``python -O``, and before any table is created for a bad request.
    if kind not in KEY_MAP:
        raise ValueError(f"unknown kind {kind!r}; expected one of {sorted(KEY_MAP)}")

    for model in (Pridict2, PrimeDesign):
        if not model.table_exists():
            model.create_table()

    mapping = KEY_MAP[kind]
    table = Pridict2 if kind == "pridict2" else PrimeDesign
    data = []
    rows = 0
    # newline="" lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with gzip.open(path, "rt", encoding="utf-8", newline="") as file:
        # Each row is a dict keyed by the CSV header names.
        for row in csv.DictReader(file):
            data.append(format_data(row, mapping))
            rows += 1
            if len(data) >= chunk:
                print(f"finished {rows} rows")
                bulk_insert(table, data)
                data = []
    # Write whatever is left over after the last full buffer.
    if data:
        bulk_insert(table, data)
def index():
    """Create single-column indexes on selected columns of both tables.

    Every statement uses ``CREATE INDEX IF NOT EXISTS``; the model name and
    column name are printed before each statement is executed.
    """
    # (model, table name used in the SQL, indexed fields) in execution order.
    targets = (
        (
            Pridict2,
            "pridict2",
            (
                Pridict2.gene,
                Pridict2.aa,
                Pridict2.sequence,
                Pridict2.dst,
                Pridict2.src,
                Pridict2.k562,
                Pridict2.hek,
                Pridict2.pbs_len,
                Pridict2.rtt_len,
            ),
        ),
        (
            PrimeDesign,
            "prime_design",
            (
                PrimeDesign.gene,
                PrimeDesign.aa,
                PrimeDesign.sequence,
                PrimeDesign.dst,
                PrimeDesign.src,
                PrimeDesign.pbs_len,
                PrimeDesign.rtt_len,
            ),
        ),
    )
    for model, table_name, fields in targets:
        for field in fields:
            print(model.__name__, field.name)
            sql = f"CREATE INDEX IF NOT EXISTS {model.__name__}_{field.name}_idx ON {table_name} ({field.name});"
            db.execute_sql(sql)
def table_columns(table):
    """Return the entries of ``table``'s own ``__dict__`` whose names contain no "__".

    Only attributes defined directly on the class are considered; inherited
    ones are not part of ``__dict__``.
    NOTE(review): the filter is purely name-based, so for a model class the
    result may include attributes that are not columns - confirm with callers.
    """
    columns = {}
    for attr_name, attr_value in vars(table).items():
        if "__" in attr_name:
            continue
        columns[attr_name] = attr_value
    return columns
if __name__ == "__main__":
    # Debug aid: dump what table_columns() reports for the Pridict2 model.
    pridict2_columns = table_columns(Pridict2)
    print(pridict2_columns)