提交饱和编辑的相关设计,及检验代码
This commit is contained in:
210
filter_freq.py
Normal file
210
filter_freq.py
Normal file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import gzip
|
||||
import heapq
|
||||
import click
|
||||
import csv
|
||||
import polars as pl
|
||||
|
||||
from multiprocessing import Pool
|
||||
|
||||
from glob import glob
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def amino_acid_to_codon():
|
||||
"""
|
||||
简化的氨基酸到密码子转换函数
|
||||
|
||||
参数:
|
||||
amino_acid (str): 单字母氨基酸代码
|
||||
|
||||
返回:
|
||||
list: 可能的密码子列表
|
||||
"""
|
||||
genetic_code = {
|
||||
'A': ['GCT', 'GCC', 'GCA', 'GCG'],
|
||||
'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
|
||||
'N': ['AAT', 'AAC'],
|
||||
'D': ['GAT', 'GAC'],
|
||||
'C': ['TGT', 'TGC'],
|
||||
'E': ['GAA', 'GAG'],
|
||||
'Q': ['CAA', 'CAG'],
|
||||
'G': ['GGT', 'GGC', 'GGA', 'GGG'],
|
||||
'H': ['CAT', 'CAC'],
|
||||
'I': ['ATT', 'ATC', 'ATA'],
|
||||
'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
|
||||
'K': ['AAA', 'AAG'],
|
||||
'M': ['ATG'],
|
||||
'F': ['TTT', 'TTC'],
|
||||
'P': ['CCT', 'CCC', 'CCA', 'CCG'],
|
||||
'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
|
||||
'T': ['ACT', 'ACC', 'ACA', 'ACG'],
|
||||
'W': ['TGG'],
|
||||
'Y': ['TAT', 'TAC'],
|
||||
'V': ['GTT', 'GTC', 'GTA', 'GTG'],
|
||||
'*': ['TAA', 'TAG', 'TGA'],
|
||||
}
|
||||
|
||||
codes = []
|
||||
for val in genetic_code.values():
|
||||
codes += val
|
||||
|
||||
return set(codes) # genetic_code.get(amino_acid.upper(), [])
|
||||
|
||||
__CODONS__ = amino_acid_to_codon()
|
||||
|
||||
|
||||
def reader(path: str, rt_len: int = 24):
|
||||
"""
|
||||
流式读取 CSV 文件,逐行返回 dict。
|
||||
内存占用恒定(只缓存一行),适合 GB 级文件。
|
||||
"""
|
||||
with gzip.open(path, "rt", newline="") as f:
|
||||
for row in csv.DictReader(f):
|
||||
try:
|
||||
if float(row["RTlength"]) <= rt_len:
|
||||
yield row
|
||||
except TypeError:
|
||||
continue
|
||||
|
||||
|
||||
def __check_target__(key: str):
|
||||
if ">" in key:
|
||||
key = key.replace(">", "_")
|
||||
|
||||
keys = key.split("_")
|
||||
|
||||
return keys[-1] in __CODONS__
|
||||
|
||||
|
||||
def __decode_codon_n__(key: str) -> str:
|
||||
# BRIP1_AA580_CTC_CTT
|
||||
|
||||
if ">" in key:
|
||||
key = key.replace(">", "_")
|
||||
|
||||
keys = key.split("_")
|
||||
|
||||
try:
|
||||
res = []
|
||||
for x, y in zip(keys[-2], keys[-1]):
|
||||
if x == y:
|
||||
res.append(x)
|
||||
else:
|
||||
res.append("N")
|
||||
keys[-1] = "".join(res)
|
||||
except IndexError as err:
|
||||
print(keys)
|
||||
raise err
|
||||
return "_".join(keys)
|
||||
|
||||
def __call_func__(args):
|
||||
u""" 实际处理代码 """
|
||||
f, outdir, top_n, degenerate = args
|
||||
|
||||
data = {}
|
||||
|
||||
# 读取文件
|
||||
for rec in tqdm(reader(f)):
|
||||
|
||||
# 根据设定好的sequence名称
|
||||
key = rec["sequence_name"]
|
||||
|
||||
if not __check_target__(key):
|
||||
# 如果target不是已知的编码氨基酸的codon则跳过
|
||||
continue
|
||||
|
||||
if degenerate:
|
||||
try:
|
||||
key = __decode_codon_n__(rec["sequence_name"])
|
||||
rec["orig_seq_name"] = rec.pop("sequence_name")
|
||||
rec["sequence_name"] = key
|
||||
except IndexError:
|
||||
continue
|
||||
|
||||
if key not in data:
|
||||
data[key] = []
|
||||
|
||||
# 数据heap化
|
||||
if "DeepCas9score" in rec.keys():
|
||||
k = "DeepCas9score"
|
||||
elif "PRIDICT2_0_editing_Score_deep_K562" in rec.keys():
|
||||
k = "PRIDICT2_0_editing_Score_deep_K562"
|
||||
else:
|
||||
print(f, rec)
|
||||
continue
|
||||
# raise ValueError(f"PRIDICT2_0_editing_Score_deep_K562 not exists in {f}")
|
||||
|
||||
try:
|
||||
score = float(rec[k])
|
||||
except (ValueError, KeyError) as e:
|
||||
print(f"Warning: Skipping invalid record in {f}: {rec}")
|
||||
continue # 或 raise,根据需求
|
||||
|
||||
if len(data[key]) < top_n:
|
||||
heapq.heappush(data[key], (score, rec))
|
||||
else:
|
||||
try:
|
||||
if score > data[key][0][0]:
|
||||
heapq.heapreplace(data[key], (score, rec))
|
||||
except TypeError as err:
|
||||
print(err)
|
||||
print(key)
|
||||
print(score)
|
||||
print(len(data[key]))
|
||||
raise err
|
||||
|
||||
# 第二遍:整理结果(按 score 降序)
|
||||
final_records = []
|
||||
for heap in data.values():
|
||||
# 从堆中取出并按 score 降序排列
|
||||
sorted_recs = [rec for _, rec in sorted(heap, key=lambda x: x[0], reverse=True)]
|
||||
final_records.extend(sorted_recs)
|
||||
|
||||
if not final_records:
|
||||
print(f"No valid records in {f}, skipping output.")
|
||||
return
|
||||
|
||||
# 安全写入 CSV(使用 csv 模块)
|
||||
output_path = os.path.join(outdir, os.path.basename(f))
|
||||
with gzip.open(output_path, "wt+", newline="", encoding="utf-8") as w:
|
||||
writer = csv.DictWriter(w, fieldnames=final_records[0].keys(), quoting=csv.QUOTE_MINIMAL)
|
||||
writer.writeheader()
|
||||
writer.writerows(final_records)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("-i", "--indir", type=str, help="字符串形式的输入路径,可以*通配多个文件和目录")
|
||||
@click.option("-o", "--outdir", type=str, help="输出目录")
|
||||
@click.option("-t", "--top-n", type=int, help="选择前几", default=3)
|
||||
@click.option("-n", "--degenerate", is_flag=True, help="是否使用兼并碱基")
|
||||
@click.argument('args', nargs=-1) # 捕获所有位置参数
|
||||
def main(indir, outdir, top_n, degenerate, args):
|
||||
|
||||
if not indir and len(args) > 0:
|
||||
indir = args[0]
|
||||
if not outdir and len(args) > 0:
|
||||
outdir = args[-1]
|
||||
|
||||
if indir == outdir:
|
||||
raise ValueError("indir and outdir should not be the same")
|
||||
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
|
||||
# 获取输入文件,生成参数
|
||||
args = [[f, outdir, top_n, degenerate] for f in glob(indir)]
|
||||
|
||||
# for arg in args:
|
||||
# print(arg[0])
|
||||
# __call_func__(arg)
|
||||
|
||||
with Pool(len(args)) as p:
|
||||
list(tqdm(p.imap(__call_func__, args), total=len(args)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user