提交饱和编辑的相关设计,及检验代码
This commit is contained in:
222
interactive/db.py
Normal file
222
interactive/db.py
Normal file
@@ -0,0 +1,222 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import peewee as pw
|
||||
import re
|
||||
import csv
|
||||
import gzip
|
||||
|
||||
from typing import Dict
|
||||
|
||||
|
||||
# Shared SQLite handle used by every model in this module. The path is
# relative, so the file is resolved against the current working directory.
db = pw.SqliteDatabase("./pegrna.db")
|
||||
|
||||
|
||||
class BaseModel(pw.Model):
    """Common parent of the models in this module; binds them to ``db``."""

    class Meta:
        # Subclasses inherit this database binding.
        database = db
|
||||
|
||||
|
||||
# Per input kind: CSV column name -> model field name.
# Columns that are not listed here are dropped by format_data.
KEY_MAP = {
    # Columns of a PRIDICT2 result file -> fields of the Pridict2 model.
    "pridict2": {
        "sequence_name": "sequence",
        "EditedAllele": "dst",
        "OriginalAllele": "src",
        "PRIDICT2_0_editing_Score_deep_K562": "k562",
        "PRIDICT2_0_editing_Score_deep_HEK": "hek",
        "K562_rank": "k562_rank",
        "HEK_rank": "hek_rank",
        "PRIDICT2_Format": "template",
        "Target-Strand": "strand",
        "PBSlength": "pbs_len",
        "RToverhanglength": "rtt_oh_len",
        "RTlength": "rtt_len",
        "Spacer-Sequence": "spacer",
        "Scaffold_Optimized": "scaffold",
        "pegRNA": "pegrna",
        "PBSrevcomp": "pbs",
        "RTseqoverhangrevcomp": "rtt_oh",
        "RTrevcomp": "rtt",
    },
    # Columns of a PrimeDesign result file -> fields of the PrimeDesign model.
    "prime_design": {
        "Target_name": "sequence",
        # No column is mapped to "dst" / "src" for this kind; format_data
        # fills them in from the parts of the sequence name instead.
        "Target_sequence": "template",
        "Strand": "strand",
        "PBS_length": "pbs_len",
        "RTT_length": "rtt_len",
        "Spacer_sequence": "spacer",
        "PAM_sequence": "pam",
        "Extension_sequence": "extension",  # RTT + PBS
        "Spacer_sequence_order_TOP": "before_spacer",
        "Spacer_sequence_order_BOTTOM": "after_spacer",
        "pegRNA_extension_sequence_order_TOP": "before_pegnra_ext",
        "pegRNA_extension_sequence_order_BOTTOM": "after_pegnra_ext",
    },
}
|
||||
|
||||
|
||||
def bulk_insert(table, data, chunk=100):
    """Insert ``data`` into ``table`` in batches inside one transaction.

    Args:
        table: Model class that provides ``insert_many``.
        data: Sliceable sequence of rows accepted by ``insert_many``.
        chunk: Maximum number of rows sent per INSERT statement.
    """
    # A single atomic block wraps all batches, so they commit together.
    with db.atomic():
        for start in range(0, len(data), chunk):
            batch = data[start:start + chunk]
            table.insert_many(batch).execute()
|
||||
|
||||
|
||||
class Pridict2(BaseModel):
    """Row of the ``pridict2`` table.

    Rows are built by ``format_data`` using ``KEY_MAP["pridict2"]``; the
    field order below is kept as declared because it defines column order.
    """

    # Parsed out of ``sequence`` by format_data.
    gene = pw.CharField()
    aa = pw.IntegerField()

    # CSV column "sequence_name".
    sequence = pw.CharField()

    # CSV columns "OriginalAllele" / "EditedAllele".
    src = pw.CharField()
    dst = pw.CharField()

    # CSV columns "PRIDICT2_0_editing_Score_deep_K562" / "..._HEK".
    k562 = pw.FloatField()
    hek = pw.FloatField()

    # CSV columns "K562_rank" / "HEK_rank".
    k562_rank = pw.IntegerField()
    hek_rank = pw.IntegerField()

    # CSV columns "PRIDICT2_Format" / "Target-Strand".
    template = pw.CharField()
    strand = pw.CharField()

    # CSV columns "PBSlength" / "RToverhanglength" / "RTlength".
    pbs_len = pw.IntegerField()
    rtt_oh_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    # CSV columns "Spacer-Sequence", "Scaffold_Optimized", "pegRNA",
    # "PBSrevcomp", "RTseqoverhangrevcomp", "RTrevcomp".
    spacer = pw.CharField()
    scaffold = pw.CharField()
    pegrna = pw.CharField()
    pbs = pw.CharField()
    rtt_oh = pw.CharField()
    rtt = pw.CharField()

    class Meta:
        table_name = "pridict2"
|
||||
|
||||
|
||||
class PrimeDesign(BaseModel):
    """Row of the ``prime_design`` table.

    Rows are built by ``format_data`` using ``KEY_MAP["prime_design"]``; the
    field order below is kept as declared because it defines column order.
    """

    # gene / aa / src / dst are all derived from ``sequence`` by format_data,
    # since no CSV column is mapped to them for this kind.
    gene = pw.CharField()
    aa = pw.IntegerField()
    sequence = pw.CharField()  # CSV column "Target_name"
    src = pw.CharField()
    dst = pw.CharField()

    # CSV columns "Target_sequence" / "Strand".
    template = pw.CharField()
    strand = pw.CharField()

    # CSV columns "PBS_length" / "RTT_length".
    pbs_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    # CSV columns "PAM_sequence", "Spacer_sequence", "Extension_sequence".
    pam = pw.CharField()
    spacer = pw.CharField()
    extension = pw.CharField()
    # Split out of ``extension`` by format_data when its length equals
    # pbs_len + rtt_len.
    pbs = pw.CharField()
    rtt = pw.CharField()
    # CSV columns "Spacer_sequence_order_TOP" / "..._BOTTOM".
    before_spacer = pw.CharField()
    after_spacer = pw.CharField()

    # CSV columns "pegRNA_extension_sequence_order_TOP" / "..._BOTTOM".
    before_pegnra_ext = pw.CharField()
    after_pegnra_ext = pw.CharField()

    class Meta:
        table_name = "prime_design"
|
||||
|
||||
|
||||
def format_data(value: Dict, mapping: Dict[str, str]) -> Dict[str, object]:
    """Translate one raw CSV row into a dict keyed by model field names.

    Columns missing from ``mapping`` are dropped. The mapped ``sequence``
    value is split on ``_`` to derive:

    * ``gene`` - the first part;
    * ``aa``   - the digits found in the second part;
    * ``src`` / ``dst`` - the last two parts, only when the row carries no
      non-empty ``src`` of its own.

    When the row has ``extension``, ``pbs_len`` and ``rtt_len`` but no
    ``pbs``, and the extension length equals ``pbs_len + rtt_len``, the
    extension is split into ``rtt`` (leading ``rtt_len`` characters) and
    ``pbs`` (the remainder).

    Args:
        value: Raw row, e.g. as produced by ``csv.DictReader``.
        mapping: Source column name -> output field name.

    Returns:
        A new dict with the renamed and derived fields.

    Raises:
        KeyError: if no column maps to ``sequence``.
        IndexError: if the sequence name has fewer than two ``_`` parts.
        ValueError: if the second part contains no digits, or a length
            field is not an integer.
    """
    # Use distinct loop names so the ``value`` parameter is not shadowed.
    res = {mapping[column]: cell for column, cell in value.items() if column in mapping}

    name_parts = res["sequence"].split("_")

    if not res.get("src"):
        res["src"] = name_parts[-2]
        res["dst"] = name_parts[-1]

    res["aa"] = int(re.sub(r"\D", "", name_parts[1]))
    res["gene"] = name_parts[0]

    if not res.get("pbs") and res.get("extension") and res.get("pbs_len") and res.get("rtt_len"):
        extension = res["extension"]
        pbs_len = int(res["pbs_len"])
        rtt_len = int(res["rtt_len"])
        if len(extension) == pbs_len + rtt_len:
            # The extension is laid out as RTT followed by PBS (see the
            # "Extension_sequence" entry of KEY_MAP), so the PBS is the
            # trailing pbs_len characters, not the leading ones.
            res["pbs"] = extension[rtt_len:]
            res["rtt"] = extension[:rtt_len]
    return res
|
||||
|
||||
|
||||
def insert(path: str, kind: str = "PRIDICT2", chunk: int = 10000):
    """Load a gzip-compressed CSV file into the table that matches ``kind``.

    Rows are streamed, converted with ``format_data`` and flushed to the
    database through ``bulk_insert`` every ``chunk`` rows, so the whole file
    is never held in memory at once.

    Args:
        path: Path to the gzip-compressed, UTF-8 encoded CSV file.
        kind: Case-insensitive key of ``KEY_MAP``; ``"pridict2"`` targets
            ``Pridict2``, any other valid key targets ``PrimeDesign``.
        chunk: Number of converted rows buffered before each flush.

    Raises:
        ValueError: if ``kind`` is not a key of ``KEY_MAP``.
    """
    kind = kind.lower()

    # Validate before touching the database. An explicit raise is used
    # because ``assert`` is stripped when Python runs with -O.
    if kind not in KEY_MAP:
        raise ValueError(f"unknown kind {kind!r}; expected one of {sorted(KEY_MAP)}")

    if not Pridict2.table_exists():
        Pridict2.create_table()

    if not PrimeDesign.table_exists():
        PrimeDesign.create_table()

    mapping = KEY_MAP[kind]
    model = Pridict2 if kind == "pridict2" else PrimeDesign

    data = []
    rows = 0
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for file objects.
    with gzip.open(path, 'rt', encoding='utf-8', newline='') as file:
        # Read line by line; each row is a dict keyed by column name.
        for row in csv.DictReader(file):
            data.append(format_data(row, mapping))
            rows += 1
            if len(data) >= chunk:
                print(f"finished {rows} rows")
                bulk_insert(model, data)
                data = []

    # Flush whatever is left after the last full chunk.
    if data:
        bulk_insert(model, data)
|
||||
|
||||
|
||||
def index():
    """Create single-column indexes on selected columns of both tables.

    For each listed field this prints ``<ModelName> <field>`` and runs
    ``CREATE INDEX IF NOT EXISTS <ModelName>_<field>_idx ON <table> (<field>)``,
    so calling it again is harmless. The table name is read from the model
    metadata so it cannot drift from ``Meta.table_name``.
    """
    # (model, fields to index); processed in this order.
    targets = (
        (
            Pridict2,
            (
                Pridict2.gene,
                Pridict2.aa,
                Pridict2.sequence,
                Pridict2.dst,
                Pridict2.src,
                Pridict2.k562,
                Pridict2.hek,
                Pridict2.pbs_len,
                Pridict2.rtt_len,
            ),
        ),
        (
            PrimeDesign,
            (
                PrimeDesign.gene,
                PrimeDesign.aa,
                PrimeDesign.sequence,
                PrimeDesign.dst,
                PrimeDesign.src,
                PrimeDesign.pbs_len,
                PrimeDesign.rtt_len,
            ),
        ),
    )

    for model, fields in targets:
        table = model._meta.table_name
        for field in fields:
            print(model.__name__, field.name)
            sql = (
                f"CREATE INDEX IF NOT EXISTS {model.__name__}_{field.name}_idx "
                f"ON {table} ({field.name});"
            )
            db.execute_sql(sql)
|
||||
|
||||
|
||||
def table_columns(table):
    """Return the entries of ``table.__dict__`` whose name has no ``__``.

    Only attributes stored directly on ``table`` are considered; anything
    whose name contains a double underscore is skipped.
    """
    selected = {}
    for attr_name, attr_value in vars(table).items():
        if "__" in attr_name:
            continue
        selected[attr_name] = attr_value
    return selected
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # When run as a script, dump the class-level attributes of Pridict2.
    print(table_columns(Pridict2))
|
||||
|
||||
Reference in New Issue
Block a user