1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
| import tqdm, re, subprocess import matplotlib.pyplot as plt import pandas as pd from Bio import SeqIO
grouping_res_path = r'******************************************' ref_path = r'***************************************************'
all_refs = [str(item.seq) for idx, item in enumerate(SeqIO.parse(ref_path,'fasta'))]
channel_num = 512 all_mutation_list = [] all_insertion_list = [] all_deletion_list = [] all_id_list = [] avg_mutation_list = [] avg_insertion_list = [] avg_deletion_list = [] avg_id_list = [] for idx in range(1, channel_num+1): records = [] for item in SeqIO.parse(grouping_res_path+f'grouping_channel_{idx}.fasta', 'fasta'): title = str(item.id) seq = str(item.seq) seq_index = int(re.findall(f'channel=\d+', title)[0][8:]) records.append((seq_index, seq)) mutation_list = [] insertion_list = [] deletion_list = [] id_list = [] with tqdm.trange(len(records), desc=f'getting grouped result {idx} channels') as pbar: for i, item in enumerate(records): ref = all_refs[item[0]] query = item[1] with open('temp_align.fasta', 'w')as f: f.write(f'>query\n') f.write(f'{query}\n') f.write(f'>ref\n') f.write(f'{ref}\n') p = subprocess.Popen(('muscle -align {} -output {}').format('temp_align.fasta', 'temp_aligned.fasta'), shell = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) for line in p.stdout.readlines(): print('',end = '') for _ in SeqIO.parse('temp_aligned.fasta', 'fasta'): if str(_.id) == 'query': aligned_query = str(_.seq) if str(_.id) == 'ref': aligned_ref = str(_.seq)
mutation = sum(1 for a, b in zip(aligned_query, aligned_ref) if a != b and not a == '-' and not b == '-') insertion = sum(1 for a, b in zip(aligned_query, aligned_ref) if a != '-' and b == '-') deletion = sum(1 for a, b in zip(aligned_query, aligned_ref) if a == '-' and b != '-')
id_list.append(f'{idx}_{i}') mutation_list.append(round(mutation/len(ref) ,4)) insertion_list.append(round(insertion/len(ref) ,4)) deletion_list.append(round(deletion/len(ref) ,4))
pbar.update() all_mutation_list += mutation_list all_insertion_list += insertion_list all_deletion_list += deletion_list all_id_list += id_list
avg_mutation = round(sum(mutation_list)/len(mutation_list), 4) avg_mutation_list.append(avg_mutation) avg_insertion = round(sum(insertion_list)/len(insertion_list), 4) avg_insertion_list.append(avg_insertion) avg_deletion = round(sum(deletion_list)/len(deletion_list), 4) avg_deletion_list.append(avg_deletion) avg_id_list.append(f'{idx}') print(f'\nGrouped seqs {idx} error rate:\n') print(f'mutation rate : {avg_mutation}\n') print(f'insertion rate : {avg_insertion}\n') print(f'deletion rate : {avg_deletion}\n')
df_all = pd.DataFrame({'mutation':all_mutation_list,'insertion':all_insertion_list,'deletion':all_deletion_list},index=all_id_list) df_all.to_csv('all error rate.csv') df_avg = pd.DataFrame({'mutation':avg_mutation_list,'insertion':avg_insertion_list,'deletion':avg_deletion_list},index=avg_id_list) df_avg.to_csv('avg error rate.csv')
|