#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
from glob import glob

import pandas as pd


# Single-letter amino acid code (plus '*' for stop) -> codons encoding it.
# Built once at import time instead of on every lookup.
_GENETIC_CODE = {
    'A': ('GCT', 'GCC', 'GCA', 'GCG'),
    'R': ('CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
    'N': ('AAT', 'AAC'),
    'D': ('GAT', 'GAC'),
    'C': ('TGT', 'TGC'),
    'E': ('GAA', 'GAG'),
    'Q': ('CAA', 'CAG'),
    'G': ('GGT', 'GGC', 'GGA', 'GGG'),
    'H': ('CAT', 'CAC'),
    'I': ('ATT', 'ATC', 'ATA'),
    'L': ('TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'),
    'K': ('AAA', 'AAG'),
    'M': ('ATG',),
    'F': ('TTT', 'TTC'),
    'P': ('CCT', 'CCC', 'CCA', 'CCG'),
    'S': ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'),
    'T': ('ACT', 'ACC', 'ACA', 'ACG'),
    'W': ('TGG',),
    'Y': ('TAT', 'TAC'),
    'V': ('GTT', 'GTC', 'GTA', 'GTG'),
    '*': ('TAA', 'TAG', 'TGA'),
}

# Header names that may hold the sequence identifier, in order of preference.
_NAME_COLUMNS = ("sequence_name", "Target_name")


def amino_acid_to_codon(amino_acid):
    """
    Simplified amino-acid-to-codon lookup.

    Args:
        amino_acid (str): Single-letter amino acid code (case-insensitive,
            surrounding whitespace ignored); '*' denotes a stop codon.

    Returns:
        list: The codons encoding the amino acid. Empty for an unknown code
        or a non-string value (e.g. the NaN pandas yields for an empty cell).
    """
    if not isinstance(amino_acid, str):
        return []
    # Return a fresh list so callers cannot mutate the shared table.
    return list(_GENETIC_CODE.get(amino_acid.strip().upper(), ()))


def _load_keys(ref):
    """Build the set of "<codon>_<column 2 value>" keys from the reference."""
    # sheet_name=1 selects the second sheet of the workbook.
    df = pd.read_excel(ref, sheet_name=1)
    keys = set()
    for row in df.itertuples(index=False, name=None):
        # row[1] is passed to amino_acid_to_codon as the amino acid code;
        # row[2] is appended verbatim to each codon to form the key.
        for codon in amino_acid_to_codon(row[1]):
            keys.add(f"{codon}_{row[2]}")
    return keys


def _find_name_column(header):
    """Return the index of the identifier column in the split header row."""
    # Tolerate quoted header cells, mirroring the quote-stripping done on
    # data values.
    names = [field.strip().strip('"') for field in header]
    for column in _NAME_COLUMNS:
        if column in names:
            return names.index(column)
    raise ValueError(
        "header has none of the expected columns: " + ", ".join(_NAME_COLUMNS)
    )


def _extract_key(name):
    """Drop the first two '_'-separated parts of *name* and strip quotes."""
    return "_".join(name.split("_")[2:]).strip('"')


def main(ref, infile, outfile):
    """
    Filter a gzipped comma-separated file down to the rows listed in a
    reference workbook.

    Keys are built from the second sheet of ``ref``: for every row, each codon
    of the amino acid in column index 1 is joined to the value in column
    index 2 as ``"<codon>_<value>"``. A data row of ``infile`` is kept when
    its ``sequence_name`` (or, failing that, ``Target_name``) field, with its
    first two ``_``-separated parts removed, equals one of those keys. The
    header row is always copied through.

    Args:
        ref: Path of the Excel reference workbook.
        infile: Path of the gzipped input file.
        outfile: Path of the gzipped output file; its parent directory is
            created when missing.

    Raises:
        ValueError: If the header has neither identifier column.
    """
    print(infile, outfile)

    keys = _load_keys(ref)

    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Open the input first so a missing input does not leave behind a
    # truncated output file.
    with gzip.open(infile, "rt") as reader, gzip.open(outfile, "wt") as writer:
        header = next(reader, None)
        if header is None:
            # Empty input: nothing to copy.
            return

        header = header.strip()
        writer.write(header + "\n")
        # Resolve the identifier column once rather than for every line.
        target = _find_name_column(header.split(","))

        for line in reader:
            line = line.strip()
            # NOTE: fields are split on bare commas; quoted fields containing
            # commas are not supported.
            fields = line.split(",")
            if len(fields) <= target:
                # Blank or truncated line: it cannot match any key.
                continue
            if _extract_key(fields[target]) in keys:
                writer.write(line + "\n")


if __name__ == '__main__':
    from fire import Fire
    Fire(main)