54 lines
1.2 KiB
Python
54 lines
1.2 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
import os
|
|
import gzip
|
|
import polars as pd
|
|
from glob import glob
|
|
from tqdm import tqdm
|
|
from multiprocessing import Pool
|
|
|
|
|
|
def read_file(args):
    """Load one CSV file into a polars DataFrame tagged with a confidence label.

    Args:
        args: A ``(path, nicking)`` pair packed into a single argument.
            ``path`` is the CSV file to read. When ``nicking`` is truthy a
            ``sequence_name`` column is added, holding the part of the file
            name that precedes ``"_nicking"``.

    Returns:
        The DataFrame with an extra ``conf`` column (``"low"`` when the path
        contains ``"low_conf"``, ``"high"`` otherwise), or ``None`` when the
        file is empty, its name starts with an excluded prefix, or reading
        it fails.
    """
    path, nicking = args

    if os.path.getsize(path) < 1:
        return None

    # Test the file name rather than the whole path: a path that carries a
    # directory component never starts with one of these prefixes, which
    # would silently disable the exclusion.
    # NOTE(review): "FABCI" looks like a typo for "FANCI" -- confirm before
    # changing, since it alters which files are skipped.
    excluded_prefixes = ("FANCD2", "BRIP1", "RAD51C", "FABCI", "FANCA")
    file_name = os.path.basename(path)
    if file_name.startswith(excluded_prefixes):
        return None

    try:
        df = pd.read_csv(path)

        if nicking:
            key = file_name.split("_nicking")[0]
            df = df.with_columns(sequence_name=pd.lit(key))
    except Exception:
        # Best effort: report the file that could not be loaded and let the
        # caller skip it instead of aborting the whole run.
        print(path)
        return None

    conf = "low" if "low_conf" in path else "high"
    return df.with_columns(conf=pd.lit(conf))
|
|
|
|
|
|
def main(indir, output, nicking=False, workers=6):
    """Merge the CSV files found in ``indir`` into one gzip-compressed CSV.

    Args:
        indir: Directory whose direct entries are read.
        output: Path of the gzip-compressed CSV file to write.
        nicking: Passed through to ``read_file`` for every input file.
        workers: Number of worker processes used to read files.

    Raises:
        ValueError: If none of the input files produced a DataFrame.
    """
    print(indir, output, nicking)

    # Sort so the row order of the output is reproducible (glob order is
    # arbitrary), and skip directories as well as the output file itself in
    # case it lives inside ``indir`` from a previous run.
    output_path = os.path.abspath(output)
    fs = sorted(
        x
        for x in glob(os.path.join(indir, "*"))
        if os.path.isfile(x) and os.path.abspath(x) != output_path
    )

    with Pool(workers) as p:
        dfs = list(tqdm(p.imap(read_file, [(x, nicking) for x in fs]), total=len(fs)))

    frames = [x for x in dfs if x is not None]
    if not frames:
        # Concatenating an empty list fails with an unhelpful message; say
        # what actually went wrong.
        raise ValueError(f"no readable input files found in {indir!r}")

    df = pd.concat(frames)

    with gzip.open(output, "wb") as w:
        df.write_csv(w)
|
|
|
|
|
|
if __name__ == "__main__":
    # The import sits inside the guard, so it only runs when the file is
    # executed as a script; Fire then exposes ``main`` as the command line.
    import fire

    fire.Fire(main)
|