#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Merge the CSV files of a directory into one gzip-compressed CSV."""
import gzip
import os
from glob import glob
from multiprocessing import Pool

import polars as pl
from tqdm import tqdm

# Files whose base name starts with one of these prefixes are skipped.
# NOTE(review): "FABCI" may be a typo for "FANCI" -- confirm before editing,
# because changing it alters which input files are excluded.
EXCLUDED_PREFIXES = ("FANCD2", "BRIP1", "RAD51C", "FABCI", "FANCA")


def read_file(args):
    """Load one CSV file and tag its rows with a confidence label.

    Args:
        args: A ``(path, nicking)`` pair packed into one argument so the
            function can be mapped with ``Pool.imap``. ``path`` is the file
            to read; when ``nicking`` is truthy a ``sequence_name`` column
            is added, holding the part of the file's base name that
            precedes ``"_nicking"``.

    Returns:
        A polars DataFrame with an added ``conf`` column (``"low"`` when
        ``"low_conf"`` occurs anywhere in ``path``, otherwise ``"high"``),
        or ``None`` when the file is excluded, empty, or cannot be read.
    """
    path, nicking = args
    name = os.path.basename(path)

    # Match on the base name: ``path`` includes the input directory, so a
    # prefix test on the full path would practically never match.
    if name.startswith(EXCLUDED_PREFIXES):
        return None

    try:
        if os.path.getsize(path) < 1:
            return None
        df = pl.read_csv(path)
        if nicking:
            key = name.split("_nicking")[0]
            df = df.with_columns(sequence_name=pl.lit(key))
    except Exception as exc:
        # Best effort: one unreadable entry (directory, broken symlink,
        # malformed CSV) must not abort the whole merge, but say why.
        print(f"{path}: {exc}")
        return None

    # The label is derived from the full path, directory names included.
    conf = "low" if "low_conf" in path else "high"
    return df.with_columns(conf=pl.lit(conf))


def main(indir, output, nicking=False, processes=6):
    """Read every entry of ``indir`` in parallel and write one merged CSV.

    Args:
        indir: Directory whose direct entries are read as CSV files.
        output: Path of the gzip-compressed CSV file to write.
        nicking: Forwarded to ``read_file``; adds a ``sequence_name`` column.
        processes: Number of worker processes used for reading.

    Raises:
        ValueError: If no entry of ``indir`` yields a table.
    """
    print(indir, output, nicking)

    # Sorted so the row order of the output does not depend on directory
    # listing order.
    paths = sorted(glob(os.path.join(indir, "*")))
    jobs = [(path, nicking) for path in paths]

    # NOTE(review): the polars documentation warns that fork-started worker
    # pools can deadlock; if workers ever hang, consider creating the pool
    # from multiprocessing.get_context("spawn").
    with Pool(processes) as pool:
        frames = [
            frame
            for frame in tqdm(pool.imap(read_file, jobs), total=len(jobs))
            if frame is not None
        ]

    if not frames:
        raise ValueError(f"no readable CSV files found in {indir!r}")

    merged = pl.concat(frames)
    with gzip.open(output, "wb") as out:
        merged.write_csv(out)


if __name__ == '__main__':
    from fire import Fire

    Fire(main)