#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Select candidate rows from a gzip-compressed CSV of codon targets.

Each input row carries a ``Target_name`` of the form
``<gene>_AA<position>_..._<codon>``.  Rows are kept when the trailing codon
is a known codon, the gene is listed in the reference workbook, and the
position is not already recorded for that gene.  At most two rows are written
per (gene, position, codon) group.
"""
import csv
import gzip
import random

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; keep working without it
    def tqdm(iterable, *args, **kwargs):
        """Stand-in for ``tqdm.tqdm`` that returns *iterable* unchanged."""
        return iterable

# Seed used when down-sampling a group, so repeated runs pick the same rows.
seed = 42

# Amino acid (Chinese name, used verbatim in the output) -> its codons.
total_codons = {
    "丙氨酸": ["GCT", "GCC", "GCA", "GCG"],
    "精氨酸": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],
    "天冬酰胺": ["AAT", "AAC"],
    "天冬氨酸": ["GAT", "GAC"],
    "半胱氨酸": ["TGT", "TGC"],
    "谷氨酰胺": ["CAA", "CAG"],
    "谷氨酸": ["GAA", "GAG"],
    "甘氨酸": ["GGT", "GGC", "GGA", "GGG"],
    "组氨酸": ["CAT", "CAC"],
    "异亮氨酸": ["ATT", "ATC", "ATA"],
    "亮氨酸": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],
    "赖氨酸": ["AAA", "AAG"],
    "甲硫氨酸": ["ATG"],
    "苯丙氨酸": ["TTT", "TTC"],
    "脯氨酸": ["CCT", "CCC", "CCA", "CCG"],
    "丝氨酸": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],
    "苏氨酸": ["ACT", "ACC", "ACA", "ACG"],
    "色氨酸": ["TGG"],
    "酪氨酸": ["TAT", "TAC"],
    "缬氨酸": ["GTT", "GTC", "GTA", "GTG"],
    "终止密码子": ["TAA", "TAG", "TGA"],
}


def _position_key(pos):
    """Return *pos* as the string that ``filter_`` compares positions with.

    ``filter_`` compares against the digits parsed from ``Target_name``, so a
    float such as ``12.0`` (what pandas yields for an integer column that
    contains blanks) must become ``"12"`` rather than ``"12.0"``.
    """
    if isinstance(pos, float) and pos.is_integer():
        pos = int(pos)
    return str(pos)


def load_finished(ref):
    """Read the reference workbook and collect the positions listed per gene.

    Args:
        ref: Workbook path (anything ``pandas.read_excel`` accepts) with
            ``gene`` and ``aa_pos`` columns.

    Returns:
        A dict mapping each gene to the set of its positions, as strings.
    """
    df = pd.read_excel(ref)
    genes = {}
    # Iterate the two columns directly: ``iterrows`` builds one Series per row
    # and may up-cast values, which changes how a position is stringified.
    for gene, pos in zip(df["gene"], df["aa_pos"]):
        genes.setdefault(gene, set()).add(_position_key(pos))
    return genes


def reader(path):
    """Yield every row of the gzip-compressed CSV at *path* as a dict."""
    # newline="" lets the csv module handle line endings itself.
    with gzip.open(path, "rt", newline="") as r:
        yield from tqdm(csv.DictReader(r))


def filter_(args):
    """Filter the input CSV and write at most two rows per target group.

    Args:
        args: A three-item sequence ``(finished, path, output)`` where
            ``finished`` maps gene -> set of position strings (see
            ``load_finished``), ``path`` is the gzip-compressed input CSV and
            ``output`` is the gzip-compressed CSV to create.

    A row is kept only if the last ``_``-separated token of ``Target_name``
    is a codon in ``total_codons``, its gene is a key of ``finished``, and
    its position is not in ``finished[gene]``.  Kept rows gain three columns:
    ``dst`` (amino acid name), ``aa_pos`` and ``gene``.  If nothing is kept,
    the output file is created without a header.

    Raises:
        KeyError: If the input has no ``Target_name`` column.
        IndexError: If a ``Target_name`` contains no ``_`` separator.
    """
    finished, path, output = args

    # Every codon belongs to exactly one amino acid, so a reverse map gives
    # the same answer as scanning ``total_codons`` for each row.
    codon_to_amino_acid = {
        codon: amino_acid
        for amino_acid, codons in total_codons.items()
        for codon in codons
    }

    data = {}
    for row in reader(path):
        # Extract the identifiers encoded in the target name.
        key = row["Target_name"].split("_")
        gene = key[0]
        pos = key[1].replace("AA", "")
        dst = key[-1]

        amino_acid = codon_to_amino_acid.get(dst)
        if amino_acid is None:
            continue
        # NOTE(review): genes absent from ``finished`` are dropped entirely;
        # confirm that this is intended rather than "keep unknown genes".
        if gene not in finished or pos in finished[gene]:
            continue

        # Grouping key.  The earlier draft interpolated the whole split list
        # (``key``) here, which would embed a list repr in the identifier.
        uid = f"{gene}_{pos}_{dst}"

        row["dst"] = amino_acid
        row["aa_pos"] = pos
        row["gene"] = gene
        data.setdefault(uid, []).append(row)

    dict_writer = None
    # newline="" stops the text layer from rewriting the csv module's "\r\n".
    with gzip.open(output, "wt", newline="") as w:
        for lines in tqdm(data.values()):
            if len(lines) > 2:
                # A fresh generator per group yields the same picks as
                # reseeding the global RNG, without touching global state.
                lines = random.Random(seed).sample(lines, 2)

            if dict_writer is None:
                # The header comes from the first row that is written.
                dict_writer = csv.DictWriter(w, fieldnames=lines[0].keys())
                dict_writer.writeheader()

            # Write the data rows.
            dict_writer.writerows(lines)


def main(ref, indir, outdir):
    """Command-line entry point.

    Args:
        ref: Reference workbook passed to ``load_finished``.
        indir: Path of the gzip-compressed input CSV (a file, despite the name).
        outdir: Path of the gzip-compressed output CSV (a file, despite the name).
    """
    finished = load_finished(ref)
    filter_([finished, indir, outdir])


if __name__ == '__main__':
    from fire import Fire
    Fire(main)