提交饱和编辑的相关设计,及检验代码

This commit is contained in:
2026-02-26 14:02:42 +08:00
commit cb556b47c0
36 changed files with 5437 additions and 0 deletions

113
select_primedesign.py Normal file
View File

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import gzip
import random
import pandas as pd
from tqdm import tqdm
# Fixed seed handed to the random module when groups of rows are down-sampled,
# so that repeated runs pick the same rows.
seed = 42

# Standard genetic code: amino-acid name -> list of DNA codons encoding it.
# The keys are runtime data (they are written into the output rows), so they
# are kept verbatim; the English name is given in the trailing comment.
total_codons = {
    "丙氨酸": ["GCT", "GCC", "GCA", "GCG"],  # alanine
    "精氨酸": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],  # arginine
    "天冬酰胺": ["AAT", "AAC"],  # asparagine
    "天冬氨酸": ["GAT", "GAC"],  # aspartic acid
    "半胱氨酸": ["TGT", "TGC"],  # cysteine
    "谷氨酰胺": ["CAA", "CAG"],  # glutamine
    "谷氨酸": ["GAA", "GAG"],  # glutamic acid
    "甘氨酸": ["GGT", "GGC", "GGA", "GGG"],  # glycine
    "组氨酸": ["CAT", "CAC"],  # histidine
    "异亮氨酸": ["ATT", "ATC", "ATA"],  # isoleucine
    "亮氨酸": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],  # leucine
    "赖氨酸": ["AAA", "AAG"],  # lysine
    "甲硫氨酸": ["ATG"],  # methionine
    "苯丙氨酸": ["TTT", "TTC"],  # phenylalanine
    "脯氨酸": ["CCT", "CCC", "CCA", "CCG"],  # proline
    "丝氨酸": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],  # serine
    "苏氨酸": ["ACT", "ACC", "ACA", "ACG"],  # threonine
    "色氨酸": ["TGG"],  # tryptophan
    "酪氨酸": ["TAT", "TAC"],  # tyrosine
    "缬氨酸": ["GTT", "GTC", "GTA", "GTG"],  # valine
    "终止密码子": ["TAA", "TAG", "TGA"],  # stop codons
}
def load_finished(ref):
    """Collect, per gene, the amino-acid positions listed in an Excel sheet.

    The workbook is read with ``pandas.read_excel`` and its ``aa_pos`` column
    is grouped by its ``gene`` column.

    Args:
        ref: Anything ``pandas.read_excel`` accepts (typically a file path)
            that points to a sheet with ``gene`` and ``aa_pos`` columns.

    Returns:
        dict mapping every gene to the set of its positions, each position
        rendered as a string.

    Raises:
        KeyError: If the ``gene`` or ``aa_pos`` column is missing.
    """
    df = pd.read_excel(ref)
    genes = {}
    # Walk the two columns directly; iterrows() would build a Series per row.
    for gene, pos in zip(df["gene"], df["aa_pos"]):
        # A numeric column that contains an empty cell is read as float, and
        # str(12.0) == "12.0" would never equal a position string such as
        # "12" that other code in this file compares against.  Render
        # integral floats without the fractional part.
        if isinstance(pos, float) and pos.is_integer():
            pos = int(pos)
        genes.setdefault(gene, set()).add(str(pos))
    return genes
def reader(path):
    """Lazily yield the rows of a gzip-compressed CSV file as dicts.

    Args:
        path: Path of the gzip-compressed CSV file; the first row is used as
            the header (``csv.DictReader`` default).

    Yields:
        One dict per data row, keyed by the header fields.  Iteration is
        wrapped in a ``tqdm`` progress bar.
    """
    # newline="" hands newline handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks are
    # parsed correctly.
    with gzip.open(path, "rt", newline="") as handle:
        yield from tqdm(csv.DictReader(handle))
def filter_(args):
    """Filter design rows, keep at most two per target, and write them out.

    Rows are streamed from the gzip-compressed CSV at ``path``.  A row is
    kept when the last ``_``-separated field of its ``Target_name`` is a
    codon found in ``total_codons`` and when its gene is present in
    ``finished`` while its position is not in that gene's set.  Kept rows
    gain the columns ``dst`` (amino-acid name of the target codon),
    ``aa_pos`` and ``gene``, are grouped by gene/position/codon, and every
    group larger than two is down-sampled to two rows with a fixed seed.
    The result is written to ``output`` as a gzip-compressed CSV whose
    header is taken from the first written row.  When no row is kept the
    output file is created empty, without a header.

    Args:
        args: A three-item sequence ``(finished, path, output)``:
            ``finished`` maps gene -> set of position strings,
            ``path`` is the input file, ``output`` the file to write.
            ``Target_name`` values are split on ``_``; the first field is
            taken as the gene, the second as the position (with ``AA``
            removed) and the last as the target codon.

    Raises:
        KeyError: If a row has no ``Target_name`` column.
        IndexError: If a ``Target_name`` contains no ``_``.
    """
    finished, path, output = args

    # Reverse lookup codon -> amino-acid name, built once instead of scanning
    # every codon list for every row.
    codon_to_aa = {
        codon: amino_acid
        for amino_acid, codons in total_codons.items()
        for codon in codons
    }

    data = {}
    for row in reader(path):
        # Extract the identifiers encoded in the target name.
        parts = row["Target_name"].split("_")
        gene = parts[0]
        pos = parts[1].replace("AA", "")
        dst = parts[-1]

        amino_acid = codon_to_aa.get(dst)
        if amino_acid is None:
            continue
        # NOTE(review): genes that do not appear in ``finished`` at all are
        # dropped entirely -- confirm that this is intended.
        if gene not in finished or pos in finished[gene]:
            continue

        row["dst"] = amino_acid
        row["aa_pos"] = pos
        row["gene"] = gene

        # NOTE(review): the key previously interpolated the whole split list
        # (its repr) instead of the gene, i.e. it grouped by the complete
        # Target_name.  Confirm gene/position/codon is the intended grouping.
        uid = f"{gene}_{pos}_{dst}"
        data.setdefault(uid, []).append(row)

    dict_writer = None
    # newline="" avoids doubled carriage returns from the csv writer on
    # platforms that translate "\n"; "+" in the mode had no effect for gzip.
    with gzip.open(output, "wt", newline="") as w:
        for lines in tqdm(data.values()):
            if len(lines) > 2:
                # A private generator seeded per group picks exactly the rows
                # that reseeding the module-level generator would, without
                # resetting global random state for other code.
                lines = random.Random(seed).sample(lines, 2)
            if dict_writer is None:
                dict_writer = csv.DictWriter(w, fieldnames=list(lines[0]))
                dict_writer.writeheader()
            # Write the data rows.
            dict_writer.writerows(lines)
def main(ref, indir, outdir):
    """Run the selection: load the reference sheet, then filter the designs.

    Args:
        ref: Excel sheet handed to ``load_finished``.
        indir: Forwarded to ``filter_`` as its input path (it is opened as a
            single gzip file there, despite the name).
        outdir: Forwarded to ``filter_`` as its output path (likewise a
            single gzip file).
    """
    filter_([load_finished(ref), indir, outdir])
if __name__ == "__main__":
    # Imported here so that merely importing this module does not need fire.
    import fire

    # Expose ``main`` and its parameters as a command-line interface.
    fire.Fire(main)