deffind_fastq_files(folder_path): folder = Path(folder_path) return [str(file) for file in folder.rglob('*.fastq')]
for barcode, fastq_dir in fastq_dirs.items(): read_ids = set() for fastq_file in tqdm(find_fastq_files(fastq_dir), desc=f"Processing {barcode}"): try: for record in SeqIO.parse(fastq_file, 'fastq'): read_ids.add(record.id) except Exception as e: print(f"Error reading {fastq_file}: {e}") pod5_id_data[barcode] = read_ids print(f"Total read ids in {fastq_dir}: {len(read_ids)}")
pod5_paths = [f'./pod5/PAS11791_as_an_example_{i}.pod5'for i inrange(0,20)]
for pod_idx, pod5_path inenumerate(pod5_paths): barcode_pod5 = { "barcode04": [], "barcode05": [], "barcode06": [] } with pod5.DatasetReader(pod5_path, recursive=True) as dataset: reads = list(dataset.reads()) for read_record in tqdm(reads, desc=f"Processing {os.path.basename(pod5_path)}"): read_id = read_record.read_id for barcode, id_set in pod5_id_data.items(): ifstr(read_id) in id_set: barcode_pod5[barcode].append(read_record.to_read()) output_pod5s = { "barcode04": f'./sequencing/293T/regrouped_pod5/barcode04/barcode04_{pod_idx}.pod5', "barcode05": f'./sequencing/293T/regrouped_pod5/barcode05/barcode05_{pod_idx}.pod5', "barcode06": f'./sequencing/293T/regrouped_pod5/barcode06/barcode06_{pod_idx}.pod5' } for barcode, output_pod5_path in output_pod5s.items(): if barcode_pod5[barcode]: with pod5.Writer(output_pod5_path) as writer: writer.add_reads(barcode_pod5[barcode]) print(f"Saved {len(barcode_pod5[barcode])} reads to {output_pod5_path}")