#!/usr/bin/env python3
# -*- coding: utf-8 -*-
|
|
import csv
|
|
import gzip
|
|
import random
|
|
import pandas as pd
|
|
|
|
from tqdm import tqdm
|
|
|
|
# Fixed seed used when down-sampling rows, so repeated runs pick the same rows.
seed = 42

# Standard genetic code: amino-acid name (Chinese) -> list of DNA codons.
# Codons are written as one space-separated string per entry and split into
# lists when the table is built.
total_codons = {
    name: codons.split()
    for name, codons in (
        ("丙氨酸", "GCT GCC GCA GCG"),  # alanine
        ("精氨酸", "CGT CGC CGA CGG AGA AGG"),  # arginine
        ("天冬酰胺", "AAT AAC"),  # asparagine
        ("天冬氨酸", "GAT GAC"),  # aspartic acid
        ("半胱氨酸", "TGT TGC"),  # cysteine
        ("谷氨酰胺", "CAA CAG"),  # glutamine
        ("谷氨酸", "GAA GAG"),  # glutamic acid
        ("甘氨酸", "GGT GGC GGA GGG"),  # glycine
        ("组氨酸", "CAT CAC"),  # histidine
        ("异亮氨酸", "ATT ATC ATA"),  # isoleucine
        ("亮氨酸", "TTA TTG CTT CTC CTA CTG"),  # leucine
        ("赖氨酸", "AAA AAG"),  # lysine
        ("甲硫氨酸", "ATG"),  # methionine
        ("苯丙氨酸", "TTT TTC"),  # phenylalanine
        ("脯氨酸", "CCT CCC CCA CCG"),  # proline
        ("丝氨酸", "TCT TCC TCA TCG AGT AGC"),  # serine
        ("苏氨酸", "ACT ACC ACA ACG"),  # threonine
        ("色氨酸", "TGG"),  # tryptophan
        ("酪氨酸", "TAT TAC"),  # tyrosine
        ("缬氨酸", "GTT GTC GTA GTG"),  # valine
        ("终止密码子", "TAA TAG TGA"),  # stop codon
    )
}
|
|
|
|
|
|
def load_finished(ref):
    """Collect the already-finished amino-acid positions of every gene.

    Args:
        ref: Anything ``pandas.read_excel`` accepts (typically a path to a
            spreadsheet), or an already loaded ``pandas.DataFrame``. The table
            must have a ``gene`` column and an ``aa_pos`` column.

    Returns:
        dict mapping each gene to a ``set`` of its positions as strings.
    """
    df = ref if isinstance(ref, pd.DataFrame) else pd.read_excel(ref)

    genes = {}
    # Walk the two columns directly; iterrows() builds a Series per row.
    for gene, pos in zip(df["gene"], df["aa_pos"]):
        # Blank cells carry no usable (gene, position) pair.
        if pd.isna(gene) or pd.isna(pos):
            continue

        # A single blank cell makes pandas read the whole column as float,
        # and str(12.0) == "12.0" would never equal a position string such
        # as "12". Integral floats are therefore written as plain integers.
        if isinstance(pos, float) and pos.is_integer():
            pos = int(pos)

        genes.setdefault(gene, set()).add(str(pos))
    return genes
|
|
|
|
|
|
def reader(path):
    """Yield the rows of a gzip-compressed CSV file, showing a progress bar.

    Args:
        path: Path of a gzip-compressed CSV file whose first line is the
            header.

    Yields:
        One dict per data row, keyed by the header fields, as produced by
        ``csv.DictReader``.
    """
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires; the encoding is pinned so that reading does not
    # depend on the platform locale.
    with gzip.open(path, "rt", encoding="utf-8", newline="") as handle:
        yield from tqdm(csv.DictReader(handle))
|
|
|
|
|
|
def filter_(args):
    """Select unfinished targets from a gzip CSV and write at most two rows each.

    A row is kept when the last ``_``-separated field of its ``Target_name``
    is a codon listed in ``total_codons``, its gene (first field) is present
    in ``finished``, and its position (second field with ``"AA"`` removed) is
    not among that gene's finished positions. Kept rows gain three columns:
    ``dst`` (amino-acid name of the codon), ``aa_pos`` and ``gene``.

    Rows are grouped by ``Target_name``. Groups with more than two rows are
    down-sampled to two, using a generator freshly seeded with ``seed`` for
    every group so that the choice is reproducible.

    Args:
        args: A three-item sequence ``(finished, path, output)``:
            ``finished`` maps gene -> collection of finished position
            strings; ``path`` is the gzip CSV to read (must have a
            ``Target_name`` column); ``output`` is the gzip CSV to write.

    Returns:
        None. If no row is kept, ``output`` is created without a header.
    """
    finished, path, output = args

    # Each codon belongs to exactly one entry of total_codons, so a reverse
    # lookup replaces a scan over the whole table for every row.
    codon_to_aa = {
        codon: amino_acid
        for amino_acid, codons in total_codons.items()
        for codon in codons
    }

    data = {}
    for row in reader(path):
        # Extract the identifiers encoded in the target name.
        target = row["Target_name"]
        parts = target.split("_")
        if len(parts) < 2:
            # No position field: the name cannot describe a gene/position
            # pair, so skip it instead of failing on parts[1].
            continue
        gene = parts[0]
        pos = parts[1].replace("AA", "")
        dst = parts[-1]

        amino_acid = codon_to_aa.get(dst)
        if amino_acid is None:
            continue
        if gene not in finished or pos in finished[gene]:
            continue

        row["dst"] = amino_acid
        row["aa_pos"] = pos
        row["gene"] = gene

        # Two rows share a group exactly when their Target_name is equal.
        data.setdefault(target, []).append(row)

    dict_writer = None
    # The dst column holds non-ASCII amino-acid names, so the encoding is
    # explicit; newline="" is what the csv module expects of its output file.
    with gzip.open(output, "wt", encoding="utf-8", newline="") as w:
        for lines in tqdm(data.values()):
            if len(lines) > 2:
                # A private generator gives the same picks as reseeding the
                # module-level one, without disturbing global random state.
                lines = random.Random(seed).sample(lines, 2)

            if dict_writer is None:
                # The header comes from the first row that gets written.
                dict_writer = csv.DictWriter(w, fieldnames=lines[0].keys())
                dict_writer.writeheader()

            # Write the data rows.
            dict_writer.writerows(lines)
|
|
|
|
|
|
def main(ref, indir, outdir):
    """Run the filter: load finished positions, then filter one file.

    Args:
        ref: Reference table handed to ``load_finished``.
        indir: Input passed to ``filter_`` as the gzip CSV path to read
            (a file path, despite the name).
        outdir: Output passed to ``filter_`` as the gzip CSV path to write
            (a file path, despite the name).
    """
    filter_([load_finished(ref), indir, outdir])
|
|
|
|
|
|
if __name__ == "__main__":
    # fire is imported only when the file is run as a script; it exposes the
    # parameters of main() as command-line arguments.
    import fire

    fire.Fire(main)
|
|
|
|
|