# procas12f/select_primedesign.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import gzip
import random
import pandas as pd

from tqdm import tqdm

# Fixed seed used by filter_ when it randomly down-samples the rows of a
# target, so repeated runs pick the same rows.
seed = 42


# Codon table: amino-acid name (in Chinese; the name is written verbatim into
# the "dst" output column by filter_) -> list of DNA codons.
# The English name of each key is given in the trailing comment.
total_codons = {
    "丙氨酸": ["GCT", "GCC", "GCA", "GCG"],  # Alanine
    "精氨酸": ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"],  # Arginine
    "天冬酰胺": ["AAT", "AAC"],  # Asparagine
    "天冬氨酸": ["GAT", "GAC"],  # Aspartic acid
    "半胱氨酸": ["TGT", "TGC"],  # Cysteine
    "谷氨酰胺": ["CAA", "CAG"],  # Glutamine
    "谷氨酸": ["GAA", "GAG"],  # Glutamic acid
    "甘氨酸": ["GGT", "GGC", "GGA", "GGG"],  # Glycine
    "组氨酸": ["CAT", "CAC"],  # Histidine
    "异亮氨酸": ["ATT", "ATC", "ATA"],  # Isoleucine
    "亮氨酸": ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"],  # Leucine
    "赖氨酸": ["AAA", "AAG"],  # Lysine
    "甲硫氨酸": ["ATG"],  # Methionine
    "苯丙氨酸": ["TTT", "TTC"],  # Phenylalanine
    "脯氨酸": ["CCT", "CCC", "CCA", "CCG"],  # Proline
    "丝氨酸": ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"],  # Serine
    "苏氨酸": ["ACT", "ACC", "ACA", "ACG"],  # Threonine
    "色氨酸": ["TGG"],  # Tryptophan
    "酪氨酸": ["TAT", "TAC"],  # Tyrosine
    "缬氨酸": ["GTT", "GTC", "GTA", "GTG"],  # Valine
    "终止密码子": ["TAA", "TAG", "TGA"],  # Stop codon
}


def load_finished(ref):
    """Load the amino-acid positions already recorded for each gene.

    Reads the workbook ``ref`` with ``pandas.read_excel`` and groups the
    values of its ``aa_pos`` column by the values of its ``gene`` column.

    Args:
        ref: Anything ``pandas.read_excel`` accepts (typically a path) that
            points at a sheet with ``gene`` and ``aa_pos`` columns.

    Returns:
        dict mapping each gene to the set of its positions, stored as
        strings.

    Raises:
        KeyError: if the sheet has rows but no ``gene`` or ``aa_pos`` column.
    """
    df = pd.read_excel(ref)

    genes = {}
    for _, row in df.iterrows():
        pos = row["aa_pos"]

        # pandas reads an integer column as float64 as soon as it contains a
        # blank cell, which would make str(pos) "12.0".  filter_ compares
        # these strings with positions parsed from Target_name ("12"), so
        # whole-number floats are turned back into ints first.
        if isinstance(pos, float) and pos.is_integer():
            pos = int(pos)

        genes.setdefault(row["gene"], set()).add(str(pos))
    return genes


def reader(path):
    """Lazily yield the rows of a gzip-compressed CSV file as dicts.

    The first line of the file is used as the header (``csv.DictReader``
    default) and iteration is wrapped in ``tqdm`` to show progress.

    Args:
        path: Path of the gzip-compressed CSV file to read.

    Yields:
        One dict per data row, keyed by the header fields.
    """
    # newline="" hands newline handling to the csv module, which its
    # documentation requires so that line breaks inside quoted fields are
    # parsed correctly.
    with gzip.open(path, "rt", newline="") as r:
        yield from tqdm(csv.DictReader(r))


def filter_(args):
    """Keep design rows for unfinished positions and write up to two per target.

    Each input row's ``Target_name`` is split on ``"_"``: the first field is
    the gene, the second field with ``"AA"`` removed is the amino-acid
    position and the last field is the destination codon.  A row is kept
    when the codon is listed in ``total_codons``, the gene is present in
    ``finished`` and the position is not among that gene's finished
    positions.  Kept rows get three extra columns: ``dst`` (the
    ``total_codons`` key of the codon), ``aa_pos`` and ``gene``.

    Rows are grouped by ``Target_name``; groups with more than two rows are
    reduced to two by seeded random sampling.  The result is written as a
    gzip-compressed CSV whose header is taken from the first written row.
    If no row is kept, the output file is created without a header.

    Args:
        args: Sequence ``(finished, path, output)`` where ``finished`` maps
            gene -> collection of position strings, ``path`` is the input
            gzip CSV and ``output`` is the gzip CSV to write.

    Raises:
        KeyError: if a row has no ``Target_name`` column.
        IndexError: if a ``Target_name`` contains no ``"_"``.
    """
    finished, path, output = args

    # Each codon appears under exactly one total_codons entry, so invert the
    # table once instead of scanning every entry for every row.
    codon_to_aa = {
        codon: aa for aa, codons in total_codons.items() for codon in codons
    }

    data = {}
    for row in reader(path):
        # Extract the identifiers encoded in the target name.
        target = row['Target_name']
        key = target.split("_")
        gene = key[0]
        pos = key[1].replace("AA", "")
        dst = key[-1]

        aa = codon_to_aa.get(dst)
        if aa is None:
            continue  # last field is not a known codon

        # NOTE(review): rows of genes that are absent from `finished` are
        # dropped as well -- confirm that this is intended.
        if gene not in finished or pos in finished[gene]:
            continue

        row["dst"] = aa
        row["aa_pos"] = pos
        row["gene"] = gene

        # The previous key interpolated the whole split list into a string,
        # which is one-to-one with Target_name; group on the name directly.
        # NOTE(review): if grouping per (gene, position, codon) was meant and
        # names can have more than three fields, use (gene, pos, dst) here.
        data.setdefault(target, []).append(row)

    dict_writer = None
    # UTF-8 is explicit because the "dst" column holds non-ASCII text, and
    # newline="" is what the csv module requires for files it writes to.
    with gzip.open(output, "wt", encoding="utf-8", newline="") as w:
        for lines in tqdm(data.values()):
            if len(lines) > 2:
                # A fresh generator per group selects exactly what reseeding
                # the global generator did, without touching global state.
                lines = random.Random(seed).sample(lines, 2)

            if dict_writer is None:
                dict_writer = csv.DictWriter(w, fieldnames=list(lines[0]))
                dict_writer.writeheader()

            # Write the data rows.
            dict_writer.writerows(lines)


def main(ref, indir, outdir):
    """Run the selection: load the finished positions, then filter the designs.

    Args:
        ref: Excel workbook handed to load_finished.
        indir: Handed to filter_ as its input path (a single gzip CSV file,
            despite the parameter name).
        outdir: Handed to filter_ as its output path (a single gzip CSV file,
            despite the parameter name).
    """
    already_done = load_finished(ref)
    job = [already_done, indir, outdir]
    filter_(job)


if __name__ == '__main__':
    # Expose main() as a command-line interface through python-fire; the
    # import stays local so importing this module does not require fire.
    import fire

    fire.Fire(main)