0%

A current signal grouping method for SQK-NBD114

First get all reads id from fastq files

Here we use barcode 04, 05 and 06 as examples.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
fastq_dirs = {
"barcode04": r'./sequencing/293T/barcode04',
"barcode05": r'./sequencing/293T/barcode05',
"barcode06": r'./sequencing/293T/barcode06'
}

pod5_id_data = { "barcode04": set(), "barcode05": set(), "barcode06": set() }

def find_fastq_files(folder_path):
folder = Path(folder_path)
return [str(file) for file in folder.rglob('*.fastq')]

for barcode, fastq_dir in fastq_dirs.items():
read_ids = set()
for fastq_file in tqdm(find_fastq_files(fastq_dir), desc=f"Processing {barcode}"):
try:
for record in SeqIO.parse(fastq_file, 'fastq'):
read_ids.add(record.id)
except Exception as e:
print(f"Error reading {fastq_file}: {e}")

pod5_id_data[barcode] = read_ids
print(f"Total read ids in {fastq_dir}: {len(read_ids)}")

Then read POD5 files and store them by barcodes

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
pod5_paths = [f'./pod5/PAS11791_as_an_example_{i}.pod5' for i in range(0,20)]

for pod_idx, pod5_path in enumerate(pod5_paths):
barcode_pod5 = { "barcode04": [], "barcode05": [], "barcode06": [] }
with pod5.DatasetReader(pod5_path, recursive=True) as dataset:
reads = list(dataset.reads())
for read_record in tqdm(reads, desc=f"Processing {os.path.basename(pod5_path)}"):
read_id = read_record.read_id
for barcode, id_set in pod5_id_data.items():
if str(read_id) in id_set:
barcode_pod5[barcode].append(read_record.to_read())

output_pod5s = {
"barcode04": f'./sequencing/293T/regrouped_pod5/barcode04/barcode04_{pod_idx}.pod5',
"barcode05": f'./sequencing/293T/regrouped_pod5/barcode05/barcode05_{pod_idx}.pod5',
"barcode06": f'./sequencing/293T/regrouped_pod5/barcode06/barcode06_{pod_idx}.pod5'
}
for barcode, output_pod5_path in output_pod5s.items():
if barcode_pod5[barcode]:
with pod5.Writer(output_pod5_path) as writer:
writer.add_reads(barcode_pod5[barcode])
print(f"Saved {len(barcode_pod5[barcode])} reads to {output_pod5_path}")

print("Signal extracted. File stored.")