#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Filter gzipped edit-sequence CSV files against a reference table.

Rows whose ``sequence_name`` starts with a ``<gene>_AA<pos>`` prefix listed in
the reference CSV are kept, their ``editseq`` value is rewritten so that the
bracketed ``(src/dst)`` part only spans the differing characters, and the
result is written to a gzipped CSV without the ``strategy`` and
``mutation_type`` columns.
"""
import csv
import gzip
import os
from glob import glob
from typing import Optional

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; keep working without it
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: return *iterable* as is."""
        return iterable


# Input columns that are removed from every row before it is written out.
_DROPPED_COLUMNS = ("strategy", "mutation_type")


def load_left_aa(ref):
    """Return the set of ``"<gene>_AA<aa_pos>"`` keys listed in a reference CSV.

    Args:
        ref: Path (or anything ``pandas.read_csv`` accepts) of a CSV that has
            ``gene`` and ``aa_pos`` columns.

    Returns:
        A set with one ``f"{gene}_AA{aa_pos}"`` string per reference row.
    """
    df = pd.read_csv(ref)
    return {f"{gene}_AA{pos}" for gene, pos in zip(df["gene"], df["aa_pos"])}


def reader(path):
    """Yield the rows of a gzip-compressed CSV file as dictionaries.

    Args:
        path: Path of the gzipped CSV file; its first line is the header.

    Yields:
        One dict per data row, as produced by ``csv.DictReader``. Iteration is
        wrapped in ``tqdm`` so progress is reported while reading.
    """
    # newline="" lets the csv module do its own line-ending handling, which
    # keeps quoted fields containing line breaks intact.
    with gzip.open(path, "rt", newline="") as handle:
        for row in tqdm(csv.DictReader(handle)):
            yield row


def process_seq(sequence: str) -> Optional[str]:
    """Shrink the bracketed ``(src/dst)`` part of *sequence* to what differs.

    *sequence* must look like ``<before>(<src>/<dst>)<after>``. Characters that
    are identical in ``src`` and ``dst`` at the edges of the bracket are moved
    out into the surrounding text. The slicing below assumes ``src`` and
    ``dst`` are three characters long -- TODO confirm against the data.

    Args:
        sequence: Text containing exactly one ``(src/dst)`` group.

    Returns:
        The rewritten sequence when ``src`` and ``dst`` differ in exactly one
        position, or in two adjacent positions. ``None`` in every other case
        (no difference, three differences, or differences in the first and
        last position only); callers use ``None`` to drop the row.

    Raises:
        ValueError: If *sequence* does not contain exactly one ``(``, one
            ``)`` and one ``/`` in the expected places.
    """
    before, codon = sequence.split("(")
    codon, after = codon.split(")")
    src, dst = codon.split("/")

    mismatches = sum(x != y for x, y in zip(src, dst))

    if mismatches == 1:
        if src[:2] == dst[:2]:
            # Only the last character differs.
            return f"{before}{src[:2]}({src[-1]}/{dst[-1]}){after}"
        if src[1:] == dst[1:]:
            # Only the first character differs.
            return f"{before}({src[0]}/{dst[0]}){src[1:]}{after}"
        # Only the middle character differs.
        return f"{before}{src[0]}({src[1]}/{dst[1]}){src[-1]}{after}"

    if mismatches == 2:
        if src[0] == dst[0]:
            # The last two characters differ.
            return f"{before}{src[0]}({src[1:]}/{dst[1:]}){after}"
        if src[-1] == dst[-1]:
            # The first two characters differ.
            return f"{before}({src[:2]}/{dst[:2]}){src[-1]}{after}"

    # Anything else is intentionally dropped by the caller.
    return None


def main(ref, infile, outfile):
    """Filter and rewrite edit sequences, then save them as a gzipped CSV.

    Args:
        ref: Path of the reference CSV read by ``load_left_aa``.
        infile: Glob pattern matching the gzipped input CSV files. Each file
            needs ``sequence_name`` and ``editseq`` columns.
        outfile: Path of the gzipped CSV to write.

    A row is kept when the first two ``_``-separated parts of its
    ``sequence_name`` form a key of the reference set and ``process_seq``
    returns a non-empty result for its ``editseq``. The ``strategy`` and
    ``mutation_type`` columns are removed from kept rows when present.

    If no row is kept, the output still gets a header line taken from the
    first input row seen; if there was no input row at all, an empty file is
    written.
    """
    wanted = load_left_aa(ref)
    kept = []
    fallback_header = None

    # glob() returns paths in arbitrary order; sort for reproducible output.
    for path in sorted(glob(infile)):
        for row in reader(path):
            if fallback_header is None:
                fallback_header = [
                    column for column in row if column not in _DROPPED_COLUMNS
                ]

            seq_name = "_".join(row["sequence_name"].split("_")[:2])
            if seq_name not in wanted:
                continue

            row["editseq"] = process_seq(row["editseq"])
            if not row["editseq"]:
                continue

            for column in _DROPPED_COLUMNS:
                row.pop(column, None)
            kept.append(row)

    fieldnames = list(kept[0]) if kept else fallback_header

    with gzip.open(outfile, "wt", newline="") as handle:
        if fieldnames:
            dict_writer = csv.DictWriter(handle, fieldnames=fieldnames)
            dict_writer.writeheader()
            # write the data rows
            dict_writer.writerows(kept)


if __name__ == '__main__':
    from fire import Fire
    Fire(main)