0%

How to count the nanopore error rate from the fastq file

Problem Background

Sometimes we want to figure out whether the error rate of nanopore sequencing is uniform across the different nanopore channels.

So we need to group the sequenced data into different channels.

But because of the fastq file format shown below, we cannot directly obtain the channel information for a given sequence id with the Biopython module (click to enter).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
...

@060d64ce-b9f6-42db-b643-2eec557c383b runid=1cd457fdb4ffa37f23fac919b243f20470561dbf sampleid=no_sample read=12841 ch=141 start_time=2023-07-02T14:36:48Z
TCTGTGTACTTCGTTCAGTTACGTATTGCTACTTGCCTGTCGCTCTATCTTCCGCTTTAAGACAACAAACTGGGGCTATGGTTAGGCGCGAAGCCTTTCTTTGTACCGCCTCAGTACTTAGGATCAGCGTCGTATTTAGCCTTCATGGGACCCTTGCCGGCTAGATCTGGACCGAGAAACCTCGCCTTCTACTCAACGTTTCGTCACGCCCAGTTTACGCGTACATGATCCTCCCTGTTGTAGGAGTCTGGTTTCTTCGCCTAGCAATATCAGCACCAACAGAAAAGCAATACG
+
####$%&&''''))+5;==>><:9985.-../9;;<;;;<=1111--.;==;3*)**)'%$$&-/7;@@<:;98889;<>>>?@<<<;:6677=7++)&(*+3:;<;;;;<=333:9::;<;;;;;=;;;989+(()9;<===>@=89988:44679::999<;;<:::8990,,**,1152(''',69::99:;=@=?@<720055555733334:=<:9866//000001<.++,2821,,''(+/1333336:<;32188::;;<>CA434::;?:97556@==9:<;:7,
@fff0126a-bbc8-4958-bd40-571785e2838f runid=1cd457fdb4ffa37f23fac919b243f20470561dbf sampleid=no_sample read=10427 ch=133 start_time=2023-07-02T14:37:38Z
AGCCGCGCGACGAACACGTACATTGCTACTTGCCCTGTCGCTCTATCTTCGTCCAATGAAAGCGTAGTTGCACTCAAATCCGCTTACCAGGCCTTCTTCTGAGATCCAACCATCTTCTGCTATAGGCAACGTGCCTTCTCCCACGTATCTGCTGGTACATCCGCTTGTTCGGCCTTCTTCCAAACCAGCTAATCCAATCCGATGCGTAGAGCGAAGGAACGATGATAGCTTTGGATCCCTAAGAACGGTAGCAATA
+
%%%%%'&'&&&%%$$%%'),***,139889::?A558==4333,--;=>>;;;887799:99:;:9656;<))))(,,--,,-/1011.+++/..'''*455885=7768;32322533/))+(((()*,////2999888335;;<><<===>=====>@<'',,(%$$$3:;===??A///38=;::8:=<;.---======>654++**(('''&+/116:<;:,,+++)''')*5+1499>=;<;;==<<64
@f08985df-31a8-43f0-b42c-2a805124d605 runid=1cd457fdb4ffa37f23fac919b243f20470561dbf sampleid=no_sample read=14188 ch=155 start_time=2023-07-02T14:37:35Z
CTGTATGTACTTCGTTCAGTTACGTATTGCTACTTGCCTGTCGCTCTATCTTCCTTGAGGGAAGCCACAAGCCCAATGGACCAAGTCTCCGCCAAGGCCGAGCAACCGTACGAGAGGAATGCGCGCCCGACCACTTTCTAGGCCTGAATTTCAAAGCCGTTACGAGCGTCCTTTTAACGCGTTAGGCCACTACACCGGCTTCGGCTAAGGAACCTGTGTCTTGAGGCACGCTTCGCCTGGTCCTGTTCTTCGCCTGGTCCTGTTAGCAATATCAGCACCAACAGAAAAGCAATACGT
+
$$$$###$&1278>>?@1111*)))-./134558==>>>??>==>=;<<;<??=<<9666.-1000'''((*89;>AAA@>><=<<????>?@????BB?>???@@>>??A>?>>=>><<>>>BFCA=;***)*/12)/+4368;;=>@CDA875258=>>>><<;;<>=7666<<;:*(()+1//0300/,,)(((&&&&%%%%'('&&%%'''(23458:5///./78:;::;;<<<=>==667BAAB??>><:??;;;::=:::99;;=>?>>>>>?@?A><;:=?==3300-*
...

Solution

By observing the description line of each sequence, we can see that all we need to do is extract the channel number from the `ch=<number>` field.

We can use a regular expression together with the module QualityIO.FastqGeneralIterator (click to enter), as shown below.

1
2
3
# Extract the channel number from each read's fastq description line.
# The description contains a `ch=<number>` field (see the example reads above).
with open(fastq_path) as handle:
    for title, sequence, quality in QualityIO.FastqGeneralIterator(handle):
        # Capture the digits directly instead of slicing 'ch=NNN'[3:];
        # re.search with a group is less fragile than findall + slice.
        channel = re.search(r'ch=(\d+)', title).group(1)

Grouped seqs into files

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Group reads by nanopore channel, keeping only reads that mapped to a barcode.
with tqdm.trange(len(fastq_list), desc='grouping by channel...') as rbar:
    for fastq in fastq_list:
        # BUG FIX: the original opened `fastq_path` here, re-reading the same
        # file on every iteration; open the current file from fastq_list instead.
        with open(fastq) as handle:
            for title, sequence, quality in QualityIO.FastqGeneralIterator(handle):
                read_id = title.split(' ')[0]  # renamed: `id` shadows the builtin
                channel = int(re.findall(r'ch=\d+', title)[0][3:])
                try:
                    # check whether this seq is mapped or not
                    target_name = pickle_data[read_id]['T_n']
                except KeyError:
                    # BUG FIX: catch KeyError specifically instead of a bare
                    # except, which would also hide unrelated errors.
                    # this seq does not map to any barcode
                    continue

                '''
                Here are some seqs filter codes
                '''

                if pickle_data[read_id]['R_s'] == '+':
                    # BUG FIX: list.append takes exactly one argument --
                    # append a single (id, target_index, sequence) tuple.
                    # we need to know which barcode this seq maps to, so that
                    # we can do the error rate count later.
                    channels[channel - 1].append((read_id, target_index, sequence))
                elif pickle_data[read_id]['R_s'] == '-':
                    # BUG FIX: `else <condition>:` is a syntax error; this
                    # must be an `elif` for the reverse-strand case.
                    ...
        rbar.update()

Counting error rate per channel

After writing the data into files, use MUSCLE to align each query (sequence) with its own reference sequence, and finally write the result into a CSV table.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import tqdm, re, subprocess
import matplotlib.pyplot as plt
import pandas as pd
from Bio import SeqIO

grouping_res_path = r'******************************************'
ref_path = r'***************************************************'

# All reference (barcode) sequences, indexed by their position in the fasta file.
all_refs = [str(item.seq) for item in SeqIO.parse(ref_path, 'fasta')]

channel_num = 512  # Obtained before

# Per-read error rates across all channels (rows of the "all" CSV).
all_mutation_list = []
all_insertion_list = []
all_deletion_list = []
all_id_list = []
# Per-channel average error rates (rows of the "avg" CSV).
avg_mutation_list = []
avg_insertion_list = []
avg_deletion_list = []
avg_id_list = []

for idx in range(1, channel_num + 1):
    # Collect (reference index, query sequence) pairs for this channel.
    records = []
    for item in SeqIO.parse(grouping_res_path + f'grouping_channel_{idx}.fasta', 'fasta'):
        title = str(item.id)
        seq = str(item.seq)
        # BUG FIX: the pattern must be a raw string, not an f-string --
        # f'channel=\d+' emits an invalid-escape warning and nothing was
        # being interpolated anyway. A capture group replaces the [0][8:] slice.
        seq_index = int(re.search(r'channel=(\d+)', title).group(1))
        records.append((seq_index, seq))

    if not records:
        # ROBUSTNESS: a channel with no grouped reads would otherwise crash
        # with ZeroDivisionError in the average computation below.
        continue

    mutation_list = []
    insertion_list = []
    deletion_list = []
    id_list = []
    with tqdm.trange(len(records), desc=f'getting grouped result {idx} channels') as pbar:
        for i, (ref_idx, query) in enumerate(records):
            ref = all_refs[ref_idx]
            # Write one query/reference pair for MUSCLE to align.
            with open('temp_align.fasta', 'w') as f:
                f.write(f'>query\n{query}\n>ref\n{ref}\n')
            # BUG FIX: invoke MUSCLE without shell=True (list argv, no string
            # formatting) and wait for it to finish via subprocess.run; the
            # original Popen + readlines loop only synchronized by accident
            # of draining the pipe, and never checked the exit status.
            subprocess.run(
                ['muscle', '-align', 'temp_align.fasta', '-output', 'temp_aligned.fasta'],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
                check=True,
            )
            for rec in SeqIO.parse('temp_aligned.fasta', 'fasta'):
                if str(rec.id) == 'query':
                    aligned_query = str(rec.seq)
                if str(rec.id) == 'ref':
                    aligned_ref = str(rec.seq)

            # Column-wise classification of the pairwise alignment:
            # mismatch = both bases present but different; insertion = base in
            # the query over a gap in the ref; deletion = gap in the query.
            mutation = sum(1 for a, b in zip(aligned_query, aligned_ref)
                           if a != b and a != '-' and b != '-')
            insertion = sum(1 for a, b in zip(aligned_query, aligned_ref)
                            if a != '-' and b == '-')
            deletion = sum(1 for a, b in zip(aligned_query, aligned_ref)
                           if a == '-' and b != '-')

            id_list.append(f'{idx}_{i}')
            # Rates are normalised by the reference length.
            mutation_list.append(round(mutation / len(ref), 4))
            insertion_list.append(round(insertion / len(ref), 4))
            deletion_list.append(round(deletion / len(ref), 4))

            pbar.update()

    all_mutation_list += mutation_list
    all_insertion_list += insertion_list
    all_deletion_list += deletion_list
    all_id_list += id_list

    avg_mutation = round(sum(mutation_list) / len(mutation_list), 4)
    avg_mutation_list.append(avg_mutation)
    avg_insertion = round(sum(insertion_list) / len(insertion_list), 4)
    avg_insertion_list.append(avg_insertion)
    avg_deletion = round(sum(deletion_list) / len(deletion_list), 4)
    avg_deletion_list.append(avg_deletion)
    avg_id_list.append(f'{idx}')

    print(f'\nGrouped seqs {idx} error rate:\n')
    print(f'mutation rate : {avg_mutation}\n')
    print(f'insertion rate : {avg_insertion}\n')
    print(f'deletion rate : {avg_deletion}\n')

df_all = pd.DataFrame({'mutation': all_mutation_list, 'insertion': all_insertion_list,
                       'deletion': all_deletion_list}, index=all_id_list)
df_all.to_csv('all error rate.csv')
df_avg = pd.DataFrame({'mutation': avg_mutation_list, 'insertion': avg_insertion_list,
                       'deletion': avg_deletion_list}, index=avg_id_list)
df_avg.to_csv('avg error rate.csv')

Saved file like below:

index mutation rate insertion rate deletion rate
0 0.0492 0.0175 0.0605
1 0.0405 0.0167 0.0496
123 0.0434 0.0208 0.0598