提交饱和编辑的相关设计,及检验代码

This commit is contained in:
2026-02-26 14:02:42 +08:00
commit cb556b47c0
36 changed files with 5437 additions and 0 deletions

87
filter.py Normal file
View File

@@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
from glob import glob
import pandas as pd
# Standard genetic code on the DNA alphabet: one-letter amino acid code ->
# synonymous codons.  '*' stands for the three stop codons.  Built once at
# import time instead of on every call.
_GENETIC_CODE = {
    'A': ('GCT', 'GCC', 'GCA', 'GCG'),
    'R': ('CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
    'N': ('AAT', 'AAC'),
    'D': ('GAT', 'GAC'),
    'C': ('TGT', 'TGC'),
    'E': ('GAA', 'GAG'),
    'Q': ('CAA', 'CAG'),
    'G': ('GGT', 'GGC', 'GGA', 'GGG'),
    'H': ('CAT', 'CAC'),
    'I': ('ATT', 'ATC', 'ATA'),
    'L': ('TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'),
    'K': ('AAA', 'AAG'),
    'M': ('ATG',),
    'F': ('TTT', 'TTC'),
    'P': ('CCT', 'CCC', 'CCA', 'CCG'),
    'S': ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'),
    'T': ('ACT', 'ACC', 'ACA', 'ACG'),
    'W': ('TGG',),
    'Y': ('TAT', 'TAC'),
    'V': ('GTT', 'GTC', 'GTA', 'GTG'),
    '*': ('TAA', 'TAG', 'TGA'),
}


def amino_acid_to_codon(amino_acid):
    """
    Simplified amino-acid-to-codon lookup.

    Args:
        amino_acid (str): One-letter amino acid code. Matching is
            case-insensitive and ignores surrounding whitespace; '*'
            selects the stop codons.

    Returns:
        list: The DNA codons for that amino acid as a fresh list the caller
        may modify. Empty when the code is unknown or when the argument is
        not a string (e.g. None, or the float NaN pandas yields for an empty
        spreadsheet cell).
    """
    # Non-string input has no .upper(); treat it as "unknown" instead of
    # raising AttributeError.
    if not isinstance(amino_acid, str):
        return []
    return list(_GENETIC_CODE.get(amino_acid.strip().upper(), ()))
def _split_csv_line(line):
    """Split one CSV record into fields, honouring double-quoted fields."""
    if '"' not in line:
        # Fast path: without quoting a plain split is exact.
        return line.split(",")
    import csv  # local import: only needed for quoted records
    return next(csv.reader([line]), [])


def _find_column(header, candidates):
    """Return the index of the first candidate name found in header, else None."""
    for name in candidates:
        if name in header:
            return header.index(name)
    return None


def main(ref, infile, outfile):
    """
    Filter a gzipped CSV, keeping only rows whose key is listed in a workbook.

    The allowed keys are built from the second sheet (index 1) of the Excel
    workbook ``ref``: for every row, each codon of the amino acid in column
    index 1 is combined with the value of column index 2 as
    ``"<codon>_<value>"``.

    ``infile`` is read line by line.  The first line is treated as the header
    and always copied to ``outfile``.  For every following line the field in
    the ``sequence_name`` column (or ``Target_name`` when the former is
    absent) is taken, everything up to and including its second underscore
    is dropped, and the line is copied when the remainder is an allowed key.

    Args:
        ref: Path of the Excel workbook that defines the allowed keys.
        infile: Path of the gzip-compressed CSV to filter.
        outfile: Path of the gzip-compressed CSV to write; missing parent
            directories are created.

    Raises:
        ValueError: If a data line is reached and the header contains neither
            a ``sequence_name`` nor a ``Target_name`` column.
    """
    print(infile, outfile)

    df = pd.read_excel(ref, 1)  # second positional argument is the sheet index
    keys = set()
    for _, row in df.iterrows():
        row = list(row)
        amino_acid, suffix = row[1], row[2]
        # Empty cells arrive as float NaN; they cannot name an amino acid.
        if not isinstance(amino_acid, str):
            continue
        for codon in amino_acid_to_codon(amino_acid):
            keys.add(f"{codon}_{suffix}")

    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    header = None
    target = None
    # Open the input first so a missing input does not leave an empty output.
    with gzip.open(infile, "rt") as r, gzip.open(outfile, "wt") as w:
        for line in r:
            line = line.strip()

            if header is None:
                w.write(line + "\n")
                header = [
                    name.strip().strip('"') for name in _split_csv_line(line)
                ]
                # Resolve the column once instead of once per data line.
                target = _find_column(header, ("sequence_name", "Target_name"))
                continue

            if target is None:
                raise ValueError(
                    f"{infile}: header has neither a 'sequence_name' "
                    "nor a 'Target_name' column"
                )

            fields = _split_csv_line(line)
            if len(fields) <= target:
                # Blank or truncated line: nothing to match against.
                continue

            # Key is whatever follows the second underscore of the name.
            key = "_".join(fields[target].split("_")[2:]).strip('"')
            if key in keys:
                w.write(line + "\n")
if __name__ == '__main__':
    # Expose main(ref, infile, outfile) as a command-line interface via Fire.
    import fire

    fire.Fire(main)