223 lines
5.5 KiB
Python
223 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
import peewee as pw
|
|
import re
|
|
import csv
|
|
import gzip
|
|
|
|
from typing import Dict
|
|
|
|
|
|
# Module-level SQLite handle shared by every model below; the database file
# path is relative to the current working directory.
db = pw.SqliteDatabase("./pegrna.db")
|
|
|
|
|
|
class BaseModel(pw.Model):
    """Common base class that binds its subclasses to the module-level ``db``."""

    class Meta:
        database = db
|
|
|
|
|
|
# Column-renaming tables, one per supported input kind.
# Outer key: lower-cased tool name (the ``kind`` accepted by ``insert``).
# Inner dict: CSV column name -> model field name. CSV columns that are not
# listed here are dropped by ``format_data``.
KEY_MAP = {
    "pridict2": {
        "sequence_name": "sequence",
        "EditedAllele": "dst",
        "OriginalAllele": "src",
        "PRIDICT2_0_editing_Score_deep_K562": "k562",
        "PRIDICT2_0_editing_Score_deep_HEK": "hek",
        "K562_rank": "k562_rank",
        "HEK_rank": "hek_rank",
        "PRIDICT2_Format": "template",
        "Target-Strand": "strand",
        "PBSlength": "pbs_len",
        "RToverhanglength": "rtt_oh_len",
        "RTlength": "rtt_len",
        "Spacer-Sequence": "spacer",
        "Scaffold_Optimized": "scaffold",
        "pegRNA": "pegrna",
        "PBSrevcomp": "pbs",
        "RTseqoverhangrevcomp": "rtt_oh",
        "RTrevcomp": "rtt",
    },
    "prime_design": {
        "Target_name": "sequence",
        # No columns are mapped to "src" / "dst" for this kind;
        # ``format_data`` derives both from the sequence name instead.
        "Target_sequence": "template",
        "Strand": "strand",
        "PBS_length": "pbs_len",
        "RTT_length": "rtt_len",
        "Spacer_sequence": "spacer",
        "PAM_sequence": "pam",
        "Extension_sequence": "extension",  # RTT followed by PBS
        "Spacer_sequence_order_TOP": "before_spacer",
        "Spacer_sequence_order_BOTTOM": "after_spacer",
        # NOTE: "pegnra" (sic) is the spelling of the model field names.
        "pegRNA_extension_sequence_order_TOP": "before_pegnra_ext",
        "pegRNA_extension_sequence_order_BOTTOM": "after_pegnra_ext",
    },
}
|
|
|
|
|
|
def bulk_insert(table, data, chunk = 100):
    """Insert ``data`` into ``table`` in batches inside one transaction.

    Args:
        table: Model class exposing ``insert_many``.
        data: Sequence of row dicts; it is sliced, so it must support
            ``len()`` and slicing.
        chunk: Number of rows sent per ``insert_many`` statement.
    """
    with db.atomic():
        offsets = range(0, len(data), chunk)
        # Each batch is a consecutive slice of at most ``chunk`` rows.
        for batch in (data[start:start + chunk] for start in offsets):
            table.insert_many(batch).execute()
|
|
|
|
|
|
class Pridict2(BaseModel):
    """One row of a PRIDICT2 CSV export, stored in table ``pridict2``.

    Field names are the targets of ``KEY_MAP["pridict2"]``; ``gene`` and
    ``aa`` are not CSV columns but are derived from ``sequence`` by
    ``format_data``.
    """

    # Derived from the sequence name.
    gene = pw.CharField()
    aa = pw.IntegerField()

    # CSV column "sequence_name".
    sequence = pw.CharField()

    # CSV columns "OriginalAllele" / "EditedAllele".
    src = pw.CharField()
    dst = pw.CharField()

    # CSV columns "PRIDICT2_0_editing_Score_deep_K562" / "..._HEK".
    k562 = pw.FloatField()
    hek = pw.FloatField()

    # CSV columns "K562_rank" / "HEK_rank".
    k562_rank = pw.IntegerField()
    hek_rank = pw.IntegerField()

    # CSV columns "PRIDICT2_Format" / "Target-Strand".
    template = pw.CharField()
    strand = pw.CharField()

    # CSV columns "PBSlength" / "RToverhanglength" / "RTlength".
    pbs_len = pw.IntegerField()
    rtt_oh_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    # Sequence columns: "Spacer-Sequence", "Scaffold_Optimized", "pegRNA",
    # "PBSrevcomp", "RTseqoverhangrevcomp", "RTrevcomp".
    spacer = pw.CharField()
    scaffold = pw.CharField()
    pegrna = pw.CharField()
    pbs = pw.CharField()
    rtt_oh = pw.CharField()
    rtt = pw.CharField()

    class Meta:
        table_name = "pridict2"
|
|
|
|
|
|
class PrimeDesign(BaseModel):
    """One row of a PrimeDesign CSV export, stored in table ``prime_design``.

    Field names are the targets of ``KEY_MAP["prime_design"]``. ``gene``,
    ``aa``, ``src`` and ``dst`` are derived from ``sequence`` by
    ``format_data``; ``pbs`` and ``rtt`` are split out of ``extension`` there.
    """

    # Derived from the sequence name (CSV column "Target_name").
    gene = pw.CharField()
    aa = pw.IntegerField()
    sequence = pw.CharField()
    src = pw.CharField()
    dst = pw.CharField()

    # CSV columns "Target_sequence" / "Strand".
    template = pw.CharField()
    strand = pw.CharField()

    # CSV columns "PBS_length" / "RTT_length".
    pbs_len = pw.IntegerField()
    rtt_len = pw.IntegerField()

    # CSV columns "PAM_sequence", "Spacer_sequence", "Extension_sequence";
    # pbs / rtt are the two parts of ``extension``.
    pam = pw.CharField()
    spacer = pw.CharField()
    extension = pw.CharField()
    pbs = pw.CharField()
    rtt = pw.CharField()
    # CSV columns "Spacer_sequence_order_TOP" / "..._BOTTOM".
    before_spacer = pw.CharField()
    after_spacer = pw.CharField()

    # CSV columns "pegRNA_extension_sequence_order_TOP" / "..._BOTTOM".
    # The "pegnra" spelling is kept as-is: it is the name KEY_MAP targets
    # and the name of the stored column.
    before_pegnra_ext = pw.CharField()
    after_pegnra_ext = pw.CharField()

    class Meta:
        table_name = "prime_design"
|
|
|
|
|
|
def format_data(value: Dict, mapping: Dict[str, str]) -> Dict[str, object]:
    """Translate one raw CSV row into a dict keyed by model field names.

    Args:
        value: One CSV row (column name -> cell text).
        mapping: CSV column name -> target field name. Columns that are not
            in the mapping are dropped.

    Returns:
        The renamed row, extended with:

        * ``src`` / ``dst`` - taken from the last two ``_``-separated parts
          of ``sequence`` when the row carries no non-empty ``src``;
        * ``gene`` - the first ``_``-separated part of ``sequence``;
        * ``aa`` - the digits of the second part, as an ``int``;
        * ``rtt`` / ``pbs`` - split out of ``extension`` when the row has no
          ``pbs`` yet and ``len(extension) == pbs_len + rtt_len``.

    Raises:
        KeyError: If no column maps to ``sequence``.
        IndexError: If ``sequence`` has fewer than two ``_``-separated parts.
        ValueError: If the second part of ``sequence`` contains no digit, or
            ``pbs_len`` / ``rtt_len`` is not an integer.
    """
    res = {
        mapping[column]: cell
        for column, cell in value.items()
        if column in mapping
    }

    parts = res["sequence"].split("_")

    if not res.get("src"):
        res["src"] = parts[-2]
        res["dst"] = parts[-1]

    res["aa"] = int(re.sub(r"\D", "", parts[1]))
    res["gene"] = parts[0]

    extension = res.get("extension")
    if not res.get("pbs") and extension and res.get("pbs_len") and res.get("rtt_len"):
        pbs_len = int(res["pbs_len"])
        rtt_len = int(res["rtt_len"])
        # Only split when the lengths account for the whole extension;
        # otherwise the boundary between the two parts is unknown.
        if len(extension) == pbs_len + rtt_len:
            # The extension is RTT followed by PBS, so the RTT is the
            # leading ``rtt_len`` characters and the PBS is the remainder.
            res["rtt"] = extension[:rtt_len]
            res["pbs"] = extension[rtt_len:]
    return res
|
|
|
|
|
|
def insert(path: str, kind: str = "PRIDICT2", chunk: int = 10000):
    """Load a gzip-compressed CSV file into the table matching ``kind``.

    Rows are renamed with ``format_data`` and buffered; every time the
    buffer reaches ``chunk`` rows it is written with ``bulk_insert`` and a
    progress line is printed. Both tables are created if they do not exist.

    Args:
        path: Path to a gzip-compressed, UTF-8 encoded CSV file with a
            header row.
        kind: Input format; matched case-insensitively against the keys of
            ``KEY_MAP``.
        chunk: Number of rows buffered in memory before each write.

    Raises:
        ValueError: If ``kind`` is not a key of ``KEY_MAP``.
    """
    kind = kind.lower()
    # Validate with an explicit exception: an ``assert`` is stripped when
    # Python runs with -O, which would defer the failure to a KeyError below.
    if kind not in KEY_MAP:
        raise ValueError(
            f"unsupported kind {kind!r}; expected one of {sorted(KEY_MAP)}"
        )

    for model in (Pridict2, PrimeDesign):
        if not model.table_exists():
            model.create_table()

    mapping = KEY_MAP[kind]
    table = Pridict2 if kind == "pridict2" else PrimeDesign

    data = []
    rows = 0
    # newline="" leaves newline handling to the csv module, as it requires
    # for quoted fields that contain line breaks.
    with gzip.open(path, "rt", encoding="utf-8", newline="") as file:
        # Read row by row; each row is a dict keyed by column name.
        for row in csv.DictReader(file):
            data.append(format_data(row, mapping))
            rows += 1
            if len(data) >= chunk:
                print(f"finished {rows} rows")
                bulk_insert(table, data)
                data = []

    # Write whatever is left over after the last full buffer.
    if data:
        bulk_insert(table, data)
|
|
|
|
|
|
def index():
    """Create simple single-column indexes on both tables.

    For each listed field the model name and field name are printed, then
    a ``CREATE INDEX IF NOT EXISTS`` statement is executed, so calling this
    again does not fail on indexes that already exist.
    """
    targets = (
        (
            Pridict2,
            "pridict2",
            (
                Pridict2.gene,
                Pridict2.aa,
                Pridict2.sequence,
                Pridict2.dst,
                Pridict2.src,
                Pridict2.k562,
                Pridict2.hek,
                Pridict2.pbs_len,
                Pridict2.rtt_len,
            ),
        ),
        (
            PrimeDesign,
            "prime_design",
            (
                PrimeDesign.gene,
                PrimeDesign.aa,
                PrimeDesign.sequence,
                PrimeDesign.dst,
                PrimeDesign.src,
                PrimeDesign.pbs_len,
                PrimeDesign.rtt_len,
            ),
        ),
    )

    for model, table_name, fields in targets:
        for field in fields:
            print(model.__name__, field.name)
            # Index name pattern: <ModelName>_<field>_idx
            statement = (
                f"CREATE INDEX IF NOT EXISTS {model.__name__}_{field.name}_idx "
                f"ON {table_name} ({field.name});"
            )
            db.execute_sql(statement)
|
|
|
|
|
|
def table_columns(table):
    """Return the entries of ``table.__dict__`` whose name has no ``"__"``.

    Args:
        table: Any object with a ``__dict__`` (called with a model class
            in this file).

    Returns:
        A new dict of attribute name -> attribute value, in ``__dict__``
        order, with every name that contains a double underscore left out.
    """
    attributes = {}
    for name, member in vars(table).items():
        if "__" in name:
            continue
        attributes[name] = member
    return attributes
|
|
|
|
|
|
if __name__ == "__main__":
    # When run as a script, print the non-dunder class attributes of
    # Pridict2.
    print(table_columns(Pridict2))
|
|
|