提交饱和编辑的相关设计,及检验代码
This commit is contained in:
113
select_primedesign.py
Normal file
113
select_primedesign.py
Normal file
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import csv
|
||||
import gzip
|
||||
import random
|
||||
import pandas as pd
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
# Fixed RNG seed so that the random down-sampling is reproducible.
seed = 42

# Standard genetic code: amino-acid name (in Chinese) -> list of DNA codons.
# The keys are written into the output, so they must stay exactly as they are.
total_codons = {
    "丙氨酸": ["GCT", "GCC", "GCA", "GCG"],  # Alanine
    "精氨酸": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],  # Arginine
    "天冬酰胺": ["AAT", "AAC"],  # Asparagine
    "天冬氨酸": ["GAT", "GAC"],  # Aspartic acid
    "半胱氨酸": ["TGT", "TGC"],  # Cysteine
    "谷氨酰胺": ["CAA", "CAG"],  # Glutamine
    "谷氨酸": ["GAA", "GAG"],  # Glutamic acid
    "甘氨酸": ["GGT", "GGC", "GGA", "GGG"],  # Glycine
    "组氨酸": ["CAT", "CAC"],  # Histidine
    "异亮氨酸": ["ATT", "ATC", "ATA"],  # Isoleucine
    "亮氨酸": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],  # Leucine
    "赖氨酸": ["AAA", "AAG"],  # Lysine
    "甲硫氨酸": ["ATG"],  # Methionine
    "苯丙氨酸": ["TTT", "TTC"],  # Phenylalanine
    "脯氨酸": ["CCT", "CCC", "CCA", "CCG"],  # Proline
    "丝氨酸": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],  # Serine
    "苏氨酸": ["ACT", "ACC", "ACA", "ACG"],  # Threonine
    "色氨酸": ["TGG"],  # Tryptophan
    "酪氨酸": ["TAT", "TAC"],  # Tyrosine
    "缬氨酸": ["GTT", "GTC", "GTA", "GTG"],  # Valine
    "终止密码子": ["TAA", "TAG", "TGA"],  # Stop codon
}
|
||||
|
||||
|
||||
def load_finished(ref):
    """Group the ``aa_pos`` values of an Excel sheet by their ``gene``.

    Args:
        ref: Path (or anything ``pandas.read_excel`` accepts) of a sheet
            that has ``gene`` and ``aa_pos`` columns.

    Returns:
        dict mapping each gene to the set of its positions, each stored as
        ``str(value)``.
    """
    table = pd.read_excel(ref)

    positions_by_gene = {}
    for _, record in table.iterrows():
        # NOTE(review): positions are stringified as-is, so a float cell
        # becomes e.g. "12.0" -- confirm the column is read as integers.
        position = str(record["aa_pos"])
        positions_by_gene.setdefault(record["gene"], set()).add(position)
    return positions_by_gene
|
||||
|
||||
|
||||
def reader(path):
    """Lazily yield the rows of a gzip-compressed CSV file as dicts.

    The file is opened in text mode and parsed with ``csv.DictReader``;
    iteration is wrapped in ``tqdm`` so progress is displayed while reading.

    Args:
        path: Location of the gzip-compressed CSV file.

    Yields:
        One dict per data row, keyed by the CSV header.
    """
    with gzip.open(path, "rt") as handle:
        yield from tqdm(csv.DictReader(handle))
|
||||
|
||||
|
||||
def filter_(args):
    """Filter design rows and keep at most two per (gene, position, codon).

    Each input row's ``Target_name`` is split on ``"_"``: the first field is
    taken as the gene, the second (with ``"AA"`` removed) as the amino-acid
    position and the last as the destination codon.  A row is kept only when
    the codon is present in ``total_codons``, the gene is a key of
    ``finished`` and the position is NOT in ``finished[gene]``.  Kept rows
    gain three columns: ``dst`` (amino-acid name), ``aa_pos`` and ``gene``.

    Rows are grouped by gene, position and destination codon; groups larger
    than two are down-sampled to two rows with a RNG seeded by ``seed``.
    The result is written as a gzip-compressed CSV.  If no row is kept the
    output contains no header and no data.

    Args:
        args: Sequence ``(finished, path, output)`` where ``finished`` maps
            gene -> set of position strings, ``path`` is the gzip CSV to
            read and ``output`` is the gzip CSV to write.

    Raises:
        IndexError: If a ``Target_name`` has no second ``"_"``-separated
            field.
    """
    finished, path, output = args

    # Invert the codon table once so every row needs a single dict lookup
    # instead of a scan over all amino acids.
    codon_to_aa = {
        codon: amino_acid
        for amino_acid, codons in total_codons.items()
        for codon in codons
    }

    data = {}
    for row in reader(path):
        # Extract the identifiers encoded in the target name.
        key = row['Target_name'].split("_")
        gene = key[0]
        pos = key[1].replace("AA", "")
        dst = key[-1]

        amino_acid = codon_to_aa.get(dst)
        if amino_acid is None:
            continue
        if gene not in finished or pos in finished[gene]:
            continue

        row["dst"] = amino_acid
        row["aa_pos"] = pos
        row["gene"] = gene

        # Group by gene (not by the split list, whose repr would otherwise
        # end up inside the identifier), position and destination codon.
        uid = f"{gene}_{pos}_{dst}"
        data.setdefault(uid, []).append(row)

    dict_writer = None
    # newline="" is what the csv module requires of files it writes to.
    with gzip.open(output, "wt", newline="") as w:
        for lines in tqdm(data.values()):
            if len(lines) > 2:
                # A fresh generator seeded per group yields the same picks
                # as reseeding the global RNG, without touching its state.
                lines = random.Random(seed).sample(lines, 2)

            if dict_writer is None:
                # The first row written defines the header.
                dict_writer = csv.DictWriter(w, fieldnames=lines[0].keys())
                dict_writer.writeheader()

            # Write the data rows.
            dict_writer.writerows(lines)
|
||||
|
||||
|
||||
def main(ref, indir, outdir):
    """Run the selection: load the reference sheet, then filter the designs.

    Args:
        ref: Excel file handed to ``load_finished``.
        indir: Passed to ``filter_`` as the input path (a single
            gzip-compressed CSV file, despite the name).
        outdir: Passed to ``filter_`` as the output path (a single
            gzip-compressed CSV file, despite the name).
    """
    filter_([load_finished(ref), indir, outdir])
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Expose ``main`` as a command-line interface through python-fire.
    import fire

    fire.Fire(main)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user