88 lines
2.2 KiB
Python
88 lines
2.2 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
import os
|
|
import gzip
|
|
from glob import glob
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
def amino_acid_to_codon(amino_acid):
    """Return the DNA codons that encode a one-letter amino-acid code.

    This is a simplified lookup against the fixed table below; the lookup is
    case-insensitive and ``"*"`` maps to the stop codons.

    Args:
        amino_acid (str): One-letter amino-acid code (or ``"*"``).

    Returns:
        list: The possible codons. An empty list is returned when the code is
        not in the table or when ``amino_acid`` is not a string.
    """
    # Spreadsheet readers hand back None/NaN for empty cells; treat those the
    # same as an unknown code instead of failing on ``.upper()``.
    if not isinstance(amino_acid, str):
        return []

    genetic_code = {
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'N': ['AAT', 'AAC'],
        'D': ['GAT', 'GAC'],
        'C': ['TGT', 'TGC'],
        'E': ['GAA', 'GAG'],
        'Q': ['CAA', 'CAG'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        'H': ['CAT', 'CAC'],
        'I': ['ATT', 'ATC', 'ATA'],
        'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'K': ['AAA', 'AAG'],
        'M': ['ATG'],
        'F': ['TTT', 'TTC'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'W': ['TGG'],
        'Y': ['TAT', 'TAC'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
        '*': ['TAA', 'TAG', 'TGA'],
    }

    return genetic_code.get(amino_acid.upper(), [])
|
|
|
|
|
|
|
|
def _target_column(columns):
    """Return the index of ``sequence_name``, falling back to ``Target_name``."""
    for name in ("sequence_name", "Target_name"):
        if name in columns:
            return columns.index(name)
    raise ValueError(
        "header has neither a 'sequence_name' nor a 'Target_name' column"
    )


def main(ref, infile, outfile):
    """Copy the rows of a gzipped CSV whose codon key is listed in ``ref``.

    The second sheet of the Excel workbook ``ref`` is read. For every row, the
    value in column index 1 is passed to ``amino_acid_to_codon`` and each
    returned codon is combined with the value in column index 2 as
    ``"<codon>_<value>"``; together these form the set of accepted keys.

    ``infile`` is then streamed line by line. Its first line is treated as the
    header and is always copied to ``outfile``. For each later non-blank line,
    the field in the ``sequence_name`` column (``Target_name`` when that column
    is absent) is split on ``"_"``, the first two pieces are dropped, the rest
    is re-joined with ``"_"`` and surrounding double quotes are removed. The
    line is copied to ``outfile`` when the resulting key is accepted.

    Args:
        ref: Path of the Excel workbook providing the accepted keys.
        infile: Path of the gzip-compressed, comma-separated input.
        outfile: Path of the gzip-compressed output; missing parent
            directories are created.

    Raises:
        ValueError: If a data row is reached and the header contains neither
            ``sequence_name`` nor ``Target_name``.
    """
    print(infile, outfile)

    table = pd.read_excel(ref, sheet_name=1)

    # Only membership is needed, so a set replaces the dict of dummy values.
    wanted = {
        f"{codon}_{row[2]}"
        for row in table.itertuples(index=False, name=None)
        for codon in amino_acid_to_codon(row[1])
    }

    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The input is opened first so that a missing or unreadable input does not
    # truncate an existing output file.
    with gzip.open(infile, "rt") as r, gzip.open(outfile, "wt") as w:
        header_line = next(r, None)
        if header_line is None:
            # Empty input: leave an empty output behind.
            return

        w.write(header_line.strip() + "\n")

        # Data fields have their quotes stripped below, so match the header
        # names the same way.
        columns = [name.strip('"') for name in header_line.strip().split(",")]

        # Resolved on the first data row only: a header-only file is copied
        # through even if it lacks the expected columns.
        target = None

        for line in r:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline) carry no key.
                continue

            if target is None:
                target = _target_column(columns)

            field = record.split(",")[target]
            key = "_".join(field.split("_")[2:]).strip('"')

            if key in wanted:
                w.write(record + "\n")
|
|
|
|
|
|
if __name__ == "__main__":
    # Expose ``main`` as a command-line interface through python-fire, so its
    # parameters (ref, infile, outfile) become CLI arguments.
    import fire

    fire.Fire(main)
|
|
|