#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Load pegRNA design tables from gzip-compressed CSV files into SQLite.

Two CSV layouts are supported, selected by the keys of ``KEY_MAP``
("pridict2" and "prime_design"). Each layout has its own peewee model and
its own column-name mapping; rows are normalised by ``format_data`` and
written in batches by ``insert``.
"""
import csv
import gzip
import re
from typing import Any, Dict

import peewee as pw

# Single SQLite database shared by every model in this module.
db = pw.SqliteDatabase("./pegrna.db")


class BaseModel(pw.Model):
    """Common base that binds every model to the module-level database."""

    class Meta:
        database = db


# Per-layout mapping from CSV column name to model field name.
# CSV columns that are not listed here are dropped by ``format_data``.
KEY_MAP = {
    "pridict2": {
        "sequence_name": "sequence",
        "EditedAllele": "dst",
        "OriginalAllele": "src",
        "PRIDICT2_0_editing_Score_deep_K562": "k562",
        "PRIDICT2_0_editing_Score_deep_HEK": "hek",
        "K562_rank": "k562_rank",
        "HEK_rank": "hek_rank",
        "PRIDICT2_Format": "template",
        "Target-Strand": "strand",
        "PBSlength": "pbs_len",
        "RToverhanglength": "rtt_oh_len",
        "RTlength": "rtt_len",
        "Spacer-Sequence": "spacer",
        "Scaffold_Optimized": "scaffold",
        "pegRNA": "pegrna",
        "PBSrevcomp": "pbs",
        "RTseqoverhangrevcomp": "rtt_oh",
        "RTrevcomp": "rtt",
    },
    "prime_design": {
        "Target_name": "sequence",
        # No CSV column maps to "src" / "dst" for this layout;
        # format_data() derives both from the sequence name instead.
        "Target_sequence": "template",
        "Strand": "strand",
        "PBS_length": "pbs_len",
        "RTT_length": "rtt_len",
        "Spacer_sequence": "spacer",
        "PAM_sequence": "pam",
        "Extension_sequence": "extension",  # RTT + PBS
        "Spacer_sequence_order_TOP": "before_spacer",
        "Spacer_sequence_order_BOTTOM": "after_spacer",
        # The "pegnra" spelling is part of the stored column names; keep it.
        "pegRNA_extension_sequence_order_TOP": "before_pegnra_ext",
        "pegRNA_extension_sequence_order_BOTTOM": "after_pegnra_ext",
    },
}


def bulk_insert(table, data, chunk=100):
    """Insert ``data`` into ``table`` in slices of ``chunk`` rows.

    All slices are written inside one ``db.atomic()`` block, so the whole
    call is committed (or rolled back) together.

    Args:
        table: peewee model class to insert into.
        data: list of row dicts keyed by field name.
        chunk: number of rows per ``insert_many`` statement.
    """
    with db.atomic():
        for start in range(0, len(data), chunk):
            table.insert_many(data[start:start + chunk]).execute()


class Pridict2(BaseModel):
    """One row of the "pridict2" layout (table ``pridict2``)."""

    # gene / aa are parsed from the sequence name by format_data().
    gene = pw.CharField()
    aa = pw.IntegerField()
    sequence = pw.CharField()
    src = pw.CharField()
    dst = pw.CharField()
    k562 = pw.FloatField()
    hek = pw.FloatField()
    k562_rank = pw.IntegerField()
    hek_rank = pw.IntegerField()
    template = pw.CharField()
    strand = pw.CharField()
    pbs_len = pw.IntegerField()
    rtt_oh_len = pw.IntegerField()
    rtt_len = pw.IntegerField()
    spacer = pw.CharField()
    scaffold = pw.CharField()
    pegrna = pw.CharField()
    pbs = pw.CharField()
    rtt_oh = pw.CharField()
    rtt = pw.CharField()

    class Meta:
        table_name = "pridict2"


class PrimeDesign(BaseModel):
    """One row of the "prime_design" layout (table ``prime_design``)."""

    # gene / aa / src / dst are parsed from the sequence name by format_data().
    gene = pw.CharField()
    aa = pw.IntegerField()
    sequence = pw.CharField()
    src = pw.CharField()
    dst = pw.CharField()
    template = pw.CharField()
    strand = pw.CharField()
    pbs_len = pw.IntegerField()
    rtt_len = pw.IntegerField()
    pam = pw.CharField()
    spacer = pw.CharField()
    extension = pw.CharField()
    # pbs / rtt are sliced out of ``extension`` by format_data().
    pbs = pw.CharField()
    rtt = pw.CharField()
    before_spacer = pw.CharField()
    after_spacer = pw.CharField()
    before_pegnra_ext = pw.CharField()
    after_pegnra_ext = pw.CharField()

    class Meta:
        table_name = "prime_design"


def format_data(value: Dict, mapping: Dict[str, str]) -> Dict[str, Any]:
    """Convert one CSV row into a dict keyed by model field names.

    Args:
        value: raw CSV row (column name -> cell).
        mapping: CSV column name -> model field name; unmapped columns
            are discarded.

    Returns:
        The renamed row, extended with ``gene`` and ``aa`` (always) and
        with ``src``/``dst`` and ``pbs``/``rtt`` when they can be derived.

    Raises:
        KeyError: if no column maps to ``sequence``.
        IndexError: if the sequence name has too few ``_``-separated parts.
        ValueError: if the second part of the sequence name has no digits.
    """
    res = {
        mapping[column]: cell
        for column, cell in value.items()
        if column in mapping
    }

    # The sequence name is read as "<gene>_<aa>_..._<src>_<dst>".
    name_parts = res["sequence"].split("_")

    if not res.get("src"):
        res["src"] = name_parts[-2]
        res["dst"] = name_parts[-1]

    # Keep only the digits of the second part, e.g. "A123" -> 123.
    res["aa"] = int(re.sub(r"\D", "", name_parts[1]))
    res["gene"] = name_parts[0]

    can_split_extension = (
        not res.get("pbs")
        and res.get("extension")
        and res.get("pbs_len")
        and res.get("rtt_len")
    )
    if can_split_extension:
        pbs_len = int(res["pbs_len"])
        # Only split when the two declared lengths account for the whole
        # extension; otherwise leave pbs / rtt unset.
        if len(res["extension"]) == pbs_len + int(res["rtt_len"]):
            # NOTE(review): the KEY_MAP comment describes the extension as
            # "RTT + PBS", but the leading pbs_len characters are stored as
            # PBS here. Confirm the intended orientation before relying on
            # these two columns.
            res["pbs"] = res["extension"][:pbs_len]
            res["rtt"] = res["extension"][pbs_len:]

    return res


def insert(path: str, kind: str = "PRIDICT2", chunk: int = 10000):
    """Read a gzip-compressed CSV file and store its rows in the database.

    Args:
        path: path to the ``.csv.gz`` file.
        kind: layout name, matched case-insensitively against ``KEY_MAP``.
        chunk: number of rows buffered in memory before each flush.

    Raises:
        ValueError: if ``kind`` is not a key of ``KEY_MAP``.
    """
    kind = kind.lower()
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if kind not in KEY_MAP:
        raise ValueError(
            f"unknown kind {kind!r}; expected one of {sorted(KEY_MAP)}"
        )

    for model in (Pridict2, PrimeDesign):
        if not model.table_exists():
            model.create_table()

    mapping = KEY_MAP[kind]
    table = Pridict2 if kind == "pridict2" else PrimeDesign

    data = []
    rows = 0
    # newline="" lets the csv module handle line endings inside quoted fields.
    with gzip.open(path, "rt", encoding="utf-8", newline="") as file:
        # Read row by row; DictReader yields each row as a dict keyed by
        # column name.
        for row in csv.DictReader(file):
            data.append(format_data(row, mapping))
            rows += 1

            if len(data) >= chunk:
                print(f"finished {rows} rows")
                bulk_insert(table, data)
                data = []

    # Flush whatever is left after the last full chunk.
    if data:
        bulk_insert(table, data)


def _create_simple_indexes(model, fields):
    """Create one single-column index per field on ``model``'s table."""
    table_name = model._meta.table_name
    for field in fields:
        print(model.__name__, field.name)
        sql = (
            f"CREATE INDEX IF NOT EXISTS {model.__name__}_{field.name}_idx "
            f"ON {table_name} ({field.name});"
        )
        db.execute_sql(sql)


def index():
    """Create simple single-column indexes on both tables.

    Uses ``CREATE INDEX IF NOT EXISTS``, so calling it again does not
    fail on indexes that already exist.
    """
    _create_simple_indexes(
        Pridict2,
        [
            Pridict2.gene,
            Pridict2.aa,
            Pridict2.sequence,
            Pridict2.dst,
            Pridict2.src,
            Pridict2.k562,
            Pridict2.hek,
            Pridict2.pbs_len,
            Pridict2.rtt_len,
        ],
    )
    _create_simple_indexes(
        PrimeDesign,
        [
            PrimeDesign.gene,
            PrimeDesign.aa,
            PrimeDesign.sequence,
            PrimeDesign.dst,
            PrimeDesign.src,
            PrimeDesign.pbs_len,
            PrimeDesign.rtt_len,
        ],
    )


def table_columns(table):
    """Return the entries of ``table.__dict__`` whose name has no ``__``."""
    return {x: y for x, y in table.__dict__.items() if "__" not in x}


if __name__ == "__main__":
    print(table_columns(Pridict2))