import pandas as pd import glob, subprocess from pathlib import Path from sklearn.metrics import jaccard_score from plotly.subplots import make_subplots import plotly.graph_objects as go import numpy as np from scipy import stats import plotly.express as px import statsmodels.api as sm from statistics import mean from requests_html import HTMLSession import xmltodict, requests from tqdm import tqdm from taxontabletools.taxontable_manipulation import strip_metadata import statistics as stat from Bio import SeqIO from io import StringIO from Bio.Seq import Seq from itertools import product def read_to_concentration_correlation(): taxonomic_level_1 = 'Species' fish_mc = pd.read_excel('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Raw_tables/Fish_MC_composition.xlsx') reference_species = [i.strip() for i in fish_mc[taxonomic_level_1].values.tolist()] MC_rand_concentration = {i[0]:i[1] for i in fish_mc[[taxonomic_level_1, 'MC_rand']].values.tolist()} alpha_diversity_dict = {} # collect number of species tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/4_adjusted/*.xlsx')) species_dict = {} for table in tables: table = Path(table) name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') df = strip_metadata(df) samples = list(df.columns[10:]) sample_species = list(set([i for i in df[taxonomic_level_1].values.tolist() if i != ''])) jaccard_sample = [] jaccard_reference = [] for species in set(sample_species + reference_species): if species in sample_species: jaccard_sample.append(1) else: jaccard_sample.append(0) if species in reference_species: jaccard_reference.append(1) else: jaccard_reference.append(0) score = round(jaccard_score(jaccard_sample, jaccard_reference), 2) print(name, score) # Shared and exclusive taxa n_total = len(set(sample_species + reference_species)) additional = set(sample_species) - set(reference_species) additional_len = len(additional) / n_total * 100 not_found = set(reference_species) - set(sample_species) not_found_len = len(not_found) / n_total * 100 shared = set(sample_species) & set(reference_species) shared_len = len(shared) / n_total * 100 alpha_diversity_dict[name] = [shared_len, additional_len, not_found_len] x_values = [] y_values = [] for species in shared: x_values.append(np.log(MC_rand_concentration[species])) reads = np.log(sum([sum(i[1:]) for i in df[['Species'] + samples].values.tolist() if i[0] == species])) y_values.append(reads) spearman = stats.spearmanr(x_values, y_values) c = round(spearman[0], 2) p = spearman[1] if p <= 0.05: plot_title = '{} (spearman={}*)'.format(name, c) else: plot_title = '{} (spearman={})'.format(name, c) fig = go.Figure() df = pd.DataFrame({'X':x_values, 'Y':y_values}) df['bestfit'] = sm.OLS(df['Y'],sm.add_constant(df['X'])).fit().fittedvalues fig.add_trace(go.Scatter(name="rho(reads)=" + str(c), x=x_values, y=df['bestfit'], mode='lines', marker=dict(color='lightgrey'))) fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='markers', marker_color='navy', marker=dict(size=15), name=c)) fig.update_xaxes(title = 'DNA concentration (log)') fig.update_yaxes(title = 'MC1 reads (log)') fig.update_layout(template='simple_white', title=plot_title, width=800, height=600, font_size=16, showlegend=False) out = Path('{}/concentration_{}.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', name)) fig.write_image(out) fig = go.Figure() colors = ['white', 'lightblue', 'navy'] for j,status in enumerate(['true positives', 'false negatives', 'false positives']): x_values = list(alpha_diversity_dict.keys()) y_values = [i[j] for i in list(alpha_diversity_dict.values())] fig.add_trace(go.Bar(x=x_values, y=y_values, marker_color=colors[j], name=status, marker_line_color='black', marker_line_width=.2)) print(status, x_values, [round(i,1) for i in y_values]) fig.update_layout(template='simple_white', barmode='stack', width=600, height=400, font_size=16, showlegend=False) fig.update_yaxes(title='fish {} (%)'.format(taxonomic_level_1.lower()), dtick=10) out = Path('{}/add_shared_excl_{}.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', taxonomic_level_1.lower())) fig.write_image(out) def bycatch(): raw_tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/2_NCsub/*.xlsx')) bycatch_ratio_dict = {} for table in raw_tables: table = Path(table) name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('No Match') df = strip_metadata(df) bycatch_taxa = list(set([i for i in df['Class'].values.tolist() if i not in ['Actinopteri', 'Hyperoartia'] and i != ''])) n_total = len(df) bycatch_sum = sum([df['Class'].values.tolist().count(taxon) for taxon in bycatch_taxa]) r1 = bycatch_sum / n_total * 100 fish_sum = sum([df['Class'].values.tolist().count(taxon) for taxon in ['Actinopteri', 'Hyperoartia']]) r2 = fish_sum / n_total * 100 bycatch_ratio_dict[name] = [r2, r1] fig = go.Figure() colors = ['white', 'navy'] for j,status in enumerate(['fish', 'bycatch']): x_values = list(bycatch_ratio_dict.keys()) y_values = [i[j] for i in list(bycatch_ratio_dict.values())] fig.add_trace(go.Bar(x=x_values, y=y_values, marker_color=colors[j], name=status, marker_line_color='black', marker_line_width=.2)) fig.update_layout(template='simple_white', barmode='stack', width=800, height=600, font_size=22, showlegend=True) fig.update_yaxes(title='OTUs (%)', dtick=10) out = Path('{}/{}.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', 'bycatch_fish')) fig.write_image(out) ## raw_tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/2_NCsub/*.xlsx')) bycatch_ratio_dict = {} for table in raw_tables: table = Path(table) name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('No Match') df = strip_metadata(df) sample_names = df.columns.tolist()[10:] bycatch_taxa = list(set([i for i in df['Class'].values.tolist() if i not in ['Actinopteri', 'Hyperoartia'] and i != ''])) total_reads = sum([sum(i) for i in df[sample_names].values.tolist()]) bycatch_reads, fish_reads = [], [] for OTU in df.values.tolist(): class_ = OTU[2] if class_ in bycatch_taxa: bycatch_reads.append(sum(OTU[10:])) else: fish_reads.append(sum(OTU[10:])) r1 = sum(bycatch_reads) / total_reads * 100 r2 = sum(fish_reads) / total_reads * 100 bycatch_ratio_dict[name] = [r2, r1] fig = go.Figure() colors = ['white', 'navy'] for j,status in enumerate(['Fish', 'Bycatch']): x_values = list(bycatch_ratio_dict.keys()) y_values = [i[j] for i in list(bycatch_ratio_dict.values())] fig.add_trace(go.Bar(x=x_values, y=y_values, marker_color=colors[j], name=status, marker_line_color='black', marker_line_width=.2)) fig.update_layout(template='simple_white', barmode='stack', width=800, height=600, font_size=22, showlegend=False) fig.update_yaxes(title='Reads (%)', dtick=10) out = Path('{}/{}.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', 'bycatch_fish_reads')) fig.write_image(out) def read_abundance(): taxonomic_level_1 = 'Species' taxonomic_level_2 = 'Family' fish_mc = pd.read_excel('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Raw_tables/Fish_MC_composition.xlsx') reference_species = [i.strip() for i in fish_mc[taxonomic_level_1].values.tolist()] ## create family specific colors colors = px.colors.qualitative.Alphabet family_dict = {i[0]:i[1] for i in fish_mc[[taxonomic_level_1, taxonomic_level_2]].values.tolist()} families = sorted(set(family_dict.values())) color_families = {j:colors[i] for i, j in enumerate(families)} color_dict = {i[0]:color_families[i[1]] for i in fish_mc[[taxonomic_level_1, taxonomic_level_2]].values.tolist()} # collect number of species tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/4_adjusted/*.xlsx')) subplot_titles = [str(Path(i).stem).split('_')[1] for i in tables] fig = make_subplots(rows=5, cols=1, shared_xaxes=True, subplot_titles=subplot_titles) row_counter = 1 for table in tables: table = Path(table) name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') df = strip_metadata(df) sample_names = df.columns.tolist()[10:] sample_species = list(set([i for i in df[taxonomic_level_1].values.tolist() if i != ''])) reads_total = sum([sum(i[10:]) for i in df.values.tolist()]) # rel read abundance dict read_abundance_dict = {} for species in reference_species: species_occurrence = [] for sample in sample_names: n_reads = sum([i[0] for i in df[[sample, 'Species']].values.tolist() if i[1] == species and i[0] != 0]) if n_reads != 0: n_reads = np.log(n_reads) species_occurrence.append(n_reads) read_abundance_dict[species] = species_occurrence x_values = ['{}'.format(i) for i in list(read_abundance_dict.keys())] y_values = list(read_abundance_dict.values()) color_list = list(color_dict.values()) ## adjust y_values to highlight absent species y_values2 = [0 if sum(i) != 0 else 10 for i in y_values] for box in zip(x_values, y_values, color_list): fig.add_trace(go.Box(y=box[1], name=box[0], marker_color=box[2], line_width=1, marker_size=2), row=row_counter, col=1) fig.add_trace(go.Bar(x=x_values, y=y_values2, marker_color='AliceBlue'), row=row_counter, col=1) row_counter += 1 fig.update_layout(height=1000, width=1000, template='simple_white', showlegend=False) fig.update_xaxes(tickmode='linear') fig.update_yaxes(title='log reads') out = Path('{}/{}.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', 'primer_comparison')) fig.write_image(out) def MC_MCrand_correlation(): # collect number of species tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/4_adjusted/*.xlsx')) subplot_titles = [str(Path(i).stem).split('_')[1] for i in tables] taxonomic_level_1 = 'Species' for table in tables: table = Path(table) name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') df = strip_metadata(df) sample_names = df.columns.tolist()[10:] sample_species = list(set([i for i in df[taxonomic_level_1].values.tolist() if i != ''])) MC_samples = [i for i in sample_names if 'MC_' in i] MC_rand_samples = [i for i in sample_names if 'MC-' in i] x_values = [] y_values = [] names = [] for species in sample_species: MC_reads = sum([sum(i[1:]) for i in df[[taxonomic_level_1] + MC_samples].values.tolist() if i[0] == species]) x_values.append(MC_reads) MC_rand_reads = sum([sum(i[1:]) for i in df[[taxonomic_level_1] + MC_rand_samples].values.tolist() if i[0] == species]) y_values.append(MC_rand_reads) names.append(species) ## convert reads to log x_values = [np.log(i) if i != 0 else 0 for i in x_values] y_values = [np.log(i) if i != 0 else 0 for i in y_values] spearman = stats.spearmanr(x_values, y_values) c = round(spearman[0], 2) p = spearman[1] if p <= 0.05: plot_title = '{} (spearman={}*)'.format(name, c) else: plot_title = '{} (spearman={})'.format(name, c) print(plot_title) fig = go.Figure() df = pd.DataFrame({'X':x_values, 'Y':y_values}) df['bestfit'] = sm.OLS(df['Y'],sm.add_constant(df['X'])).fit().fittedvalues fig.add_trace(go.Scatter(name="rho(reads)=" + str(c), x=x_values, y=df['bestfit'], mode='lines', marker=dict(color='lightgrey'))) fig.add_trace(go.Scatter(x=x_values, y=y_values, hovertext=names, mode='markers', marker_color='navy', marker=dict(size=15))) fig.update_layout(template='simple_white', title=plot_title, width=800, height=600, font_size=16, showlegend=False) fig.update_yaxes(title='log reads (MC1)', tick0=0, dtick=2) fig.update_xaxes(title='log reads (MC2)', tick0=0, dtick=2) out = Path('{}/{}_{}.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', 'mc_vs_mc_rand', name)) fig.write_image(out) def PCR_replicates(): taxonomic_level_1 = 'Species' fish_mc = pd.read_excel('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Raw_tables/Fish_MC_composition.xlsx') # collect number of species tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/4_adjusted/*.xlsx')) jaccard_scores = {} for table in tables: table = Path(table) table_name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') df = strip_metadata(df) sample_names = df.columns.tolist()[10:] all_species = set([i for i in df[taxonomic_level_1].values.tolist() if i != '']) MC_samples = [i for i in sample_names if 'MC_' in i] # MC_samples = [i for i in sample_names if 'MC-' in i] matrix = [] for s1 in MC_samples: species_s1 = set([i[0] for i in df[['Species', s1]].values.tolist() if i[0] != '' and i[1] != 0]) pa_s1 = [1 if i in species_s1 else 0 for i in all_species ] scores = [] for s2 in MC_samples: species_s2 = [i[0] for i in df[['Species', s2]].values.tolist() if i[0] != '' and i[1] != 0] pa_s2 = [1 if i in species_s2 else 0 for i in all_species ] score = round(jaccard_score(pa_s1, pa_s2),2) scores.append(score) matrix.append(scores) upper_tria = [list(i) for i in np.triu(matrix, k=1)] upper_tria_lst = [item for sublist in upper_tria for item in sublist] mean_values = round(mean([(i) for i in upper_tria_lst if i != 0]),2) min_values, max_values = min([(i) for i in upper_tria_lst if i != 0]), max([(i) for i in upper_tria_lst if i != 0]) jaccard_scores[table_name] = [mean_values, min_values, max_values] def number_of_species_per_taxon(): taxonomic_level_1 = 'Species' taxonomic_level_1b = 'ID' taxonomic_level_2 = 'Family' tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/4_adjusted/*.xlsx')) fish_mc = pd.read_excel('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Raw_tables/Fish_MC_composition.xlsx') n_species_mock = {fam:fish_mc['Family'].values.tolist().count(fam) for fam in sorted(set(fish_mc['Family'].values.tolist()))} all_families = [] for table in tables: table = Path(table) df = pd.read_excel(table).fillna('') df = strip_metadata(df) families = [i for i in df[taxonomic_level_2] if i != ''] all_families.extend(families) all_families = sorted(list(set(all_families)))[::-1] # collect number of species species_dict = {} rel_dict = {} ratio_dict = {} for table in tables: table = Path(table) table_name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') sample_names = df.columns.tolist()[10:] all_species = {i[1]:i[0] for i in df[[taxonomic_level_2, taxonomic_level_1]].values.tolist() if i[1] != ''} all_OTUs = {i[1]:i[0] for i in df[[taxonomic_level_2, taxonomic_level_1b]].values.tolist() if i[1] != ''} species_list = [] text_list = [] ratio_list = [] for fam in all_families: n_species = list(all_species.values()).count(fam) n_OTUs = list(all_OTUs.values()).count(fam) if fam in n_species_mock.keys(): n_MC = n_species_mock[fam] else: n_MC = 0 species_list.append(n_species) text = '{}|{}|{}'.format(n_MC, n_species, n_OTUs) text_list.append(text) if n_species != 0: ratio = round(n_OTUs / n_species * 100,1) else: ratio = 0 ratio_list.append(ratio) species_dict[table_name] = species_list rel_dict[table_name] = text_list ratio_dict[table_name] = ratio_list df1 = pd.DataFrame(species_dict, index=all_families) df2 = pd.DataFrame(rel_dict, index=all_families) df3 = pd.DataFrame(ratio_dict, index=all_families) df3.to_excel('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/Upload_MBMG_methods/table_3.xlsx') ### part 2 ## addiationally count the congruent species species_mc = {i[1]:i[0] for i in fish_mc[[taxonomic_level_2, taxonomic_level_1]].values.tolist() if i[1] != ''} false_pos_neg_dict = {} names_dict = {} for table in tables: table = Path(table) table_name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') sample_names = df.columns.tolist()[10:] all_species = {i[1]:i[0] for i in df[[taxonomic_level_2, taxonomic_level_1]].values.tolist() if i[1] != ''} heatmap_values = [] text_values = [] for fam in all_families: sample_species = set([i[0] for i in all_species.items() if i[1] == fam]) mc_species = set([i[0] for i in species_mc.items() if i[1] == fam]) n_species_mc = len(mc_species) false_positives = len(sample_species - mc_species) false_negatives = len(mc_species - sample_species) text = '{}/{}'.format(false_positives, false_negatives) heatmap_values.append(false_positives+false_negatives) text_values.append(text) false_pos_neg_dict[table_name] = heatmap_values names_dict[table_name] = text_values df3 = pd.DataFrame(false_pos_neg_dict, index=all_families) df4 = pd.DataFrame(names_dict, index=all_families) ## PLOT A fig = go.Figure(data=go.Heatmap( z = df1.values.tolist(), x = df1.columns.tolist(), y = df1.index.tolist(), text = df2.values.tolist(), texttemplate="%{text}", colorscale = 'Blues', textfont={"size":20}, hoverongaps = False)) fig.update_layout(height=1000, width=1000, template='simple_white', showlegend=False, font_size=20) out = Path('{}/{}_per_{}_true_pos.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', taxonomic_level_1, taxonomic_level_2)) fig.write_image(out) ## PLOT B fig = go.Figure(data=go.Heatmap( z = df3.values.tolist(), x = df3.columns.tolist(), y = df3.index.tolist(), text = df4.values.tolist(), texttemplate="%{text}", colorscale = 'Reds', textfont={"size":20}, hoverongaps = False)) fig.update_layout(height=1000, width=1000, template='simple_white', showlegend=False, font_size=20) out = Path('{}/{}_per_{}_false_pos_neg.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', taxonomic_level_1, taxonomic_level_2)) fig.write_image(out) ## sum of false results primers = df4.columns.tolist() for primer in primers: sum_false_taxa = [] for r in df4[primer].values.tolist(): sum_false_taxa.append(sum([int(i) for i in r.split("/")])) print(primer, sum(sum_false_taxa)) def false_negatives(): fn = ["Cottus gobio", "Rutilus pigus", "Gymnocephalus schraetser", "Zingel zingel", "Umbra krameri", "Lampetra fluviatilis", "Blicca bjoerkna", "Leuciscus idus"] tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/*.xlsx')) false_neg = {} for table in tables: table = Path(table) table_name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') df = strip_metadata(df) genus_only = set([i[0] for i in df[['Genus', 'Species']].values.tolist() if i[1]=='' and i[0]!='']) for species in fn: genus = species.split()[0] if genus in genus_only: if species in false_neg.keys(): false_neg[species] = false_neg[species] + ['Yes'] else: false_neg[species] = ['Yes'] else: if species in false_neg.keys(): false_neg[species] = false_neg[species] + ['No'] else: false_neg[species] = ['No'] def all_taxa_analysis_plot(): taxonomic_level_1 = 'Species' taxonomic_level_2 = 'Family' tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/4_adjusted/*.xlsx')) fish_mc = pd.read_excel('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Raw_tables/Fish_MC_composition.xlsx') fish_mc_species = sorted(set(fish_mc['Species'].values.tolist())) all_species = [] + fish_mc_species species_dict = {} table_names = [] for table in tables: species = [i for i in pd.read_excel(table).fillna('')['Species'].values.tolist() if i != ''] table = Path(table) table_name = str(table.stem).split('_')[1] table_names.append(table_name) all_species.extend(species) species_dict[table_name] = species names = ['{}*'.format(i) if i not in fish_mc_species else i for i in sorted(set(all_species))] all_species = sorted(set(all_species)) read_abundance_dict = {} for table in tables: table = Path(table) name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') df = strip_metadata(df) sample_names = df.columns.tolist()[10:] total_reads = sum([sum(i) for i in df[sample_names].values.tolist()]) for species in all_species: species_occurrence = [] for sample in sample_names: sample_reads = sum(df[sample].values.tolist()) n_reads = sum([i[0] for i in df[[sample, 'Species']].values.tolist() if i[1] == species and i[0] != 0]) n_reads = n_reads / sample_reads * 100 species_occurrence.append(n_reads) # mean_value = round(np.mean([np.log(i) if i != 0 else 0 for i in species_occurrence]),2) mean_value = round(np.mean([species_occurrence]),3) if species in fish_mc_species: name = species else: name = '00_{}'.format(species) if name in read_abundance_dict.keys(): read_abundance_dict[name] = read_abundance_dict[name] + [mean_value] else: read_abundance_dict[name] = [mean_value] df2 = pd.DataFrame(read_abundance_dict).transpose() df2.columns = table_names df2.to_excel('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/Upload_MBMG_methods/table_21.xlsx') def species_to_OTU_ration(): taxonomic_level_1 = 'Species' taxonomic_level_2 = 'Family' tables = sorted(glob.glob('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/TaXon_tables/4_adjusted/*.xlsx')) fish_mc = pd.read_excel('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Raw_tables/Fish_MC_composition.xlsx') fish_mc_species = sorted(set(fish_mc['Species'].values.tolist())) all_species = [] + fish_mc_species family_dict = {} species_dict = {} table_names = [] for table in tables: res = [i for i in pd.read_excel(table).fillna('')[['Family','Species']].values.tolist() if i[1] != ''] species = [i[1] for i in res] family = [i[0] for i in res] table = Path(table) table_name = str(table.stem).split('_')[1] table_names.append(table_name) all_species.extend(species) species_dict[table_name] = species for s,f in zip(species, family): family_dict[s] = f names = ['{}*'.format(i) if i not in fish_mc_species else i for i in sorted(set(all_species))] all_species = sorted(set(all_species)) species_OTU_list = [] for species, family in family_dict.items(): ratio_list = [family, species] for table in tables: table = Path(table) name = str(table.stem).split('_')[1] df = pd.read_excel(table).fillna('') df = strip_metadata(df) sample_names = df.columns.tolist()[10:] OTU_list = df['Species'].values.tolist() n_OTUs = OTU_list.count(species) if n_OTUs != 0: ratio = round(n_OTUs / 1 * 100, 1) if ratio == 100: ratio_list.append(0) else: ratio_list.append(ratio) else: ratio_list.append('-') m = [i for i in ratio_list[2:] if i != 0 and i != '-'] if m != []: mean_value = round(np.mean([i for i in m if i != 0 and i != '-']), 1) ratio_list.append(mean_value) else: ratio_list.append(0) species_OTU_list.append(ratio_list) df3 = pd.DataFrame(species_OTU_list, columns=['Family', 'Species'] + table_names + ['Mean oversplitting rate (%)']) df3.sort_values(by=['Family', 'Species', 'Mean oversplitting rate (%)'], inplace=True) df3.to_excel('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/Upload_MBMG_methods/table_3.xlsx', index=False) def ambigous_taxa(): taxonomic_level_1 = 'Species' taxonomic_level_2 = 'Family' tables = sorted(glob.glob('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/Upload_MBMG_methods/4_supplementary_material/Supplementary_Material_4/*')) table_names = [] flags = ['F1 - Dominant taxon', 'F2 - Two species, one genus', 'F3 - Multiple species of one genus', 'F4 - Multiple genera'] flags_dict = {} all_flags = [] for table in tables: table = Path(table) table_name = str(table.stem).split('_')[1] table_names.append(table_name) df = pd.read_excel(table).fillna('') all_flags.extend([i[0] for i in df[['Species', 'Flag']].values.tolist() if i[0] != '' and i[1] != '']) ## collect species and ambigous taxa supported_species = [i for i in df[['Species', 'Flag']].values.tolist() if i[0] != '' and i[1] == ''] flags_list = [i for i in df['Flag'].values.tolist() if i != ''] n_supported_species = len(supported_species) n_flagged_species = len(flags_list) n_total_species = n_supported_species + n_flagged_species ## count OTUs without a flag res = len(supported_species) / n_total_species * 100 if 'Supported species' in flags_dict.keys(): flags_dict['Supported species'] = flags_dict['Supported species'] + [res] else: flags_dict['Supported species'] = [res] ## count OTUs with a flag for flag in flags: print(flag, flags_list.count(flag)) res = flags_list.count(flag) / n_total_species * 100 if flag in flags_dict.keys(): flags_dict[flag] = flags_dict[flag] + [res] else: flags_dict[flag] = [res] fig = go.Figure() colors = ['#ffffff', '#c0d2e2', '#85a8c8', '#4379aa', '#003b6f'] for i, (flag, values) in enumerate(flags_dict.items()): color = colors[i] y_values = values x_values = table_names fig.add_trace(go.Bar(x=x_values, y=y_values, marker_color=color, name=flag, marker_line_color='black', marker_line_width=.2)) fig.update_layout(template='simple_white', barmode='stack', width=800, height=600, font_size=22, showlegend=False) fig.update_yaxes(title='species-level OTUs (%)', dtick=10) out = Path('{}/{}.pdf'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', 'flags')) fig.write_image(out) out = Path('{}/{}.html'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', 'flags')) fig.write_html(out) d = dict(pd.Series(all_flags).value_counts()) df = pd.DataFrame([[i,j] for i,j in d.items()], columns=['Species', '# occurrences']) out = Path('{}/{}.xlsx'.format('/Users/tillmacher/Desktop/TTT_projects/Projects/Fish_MC_V2/Fish_MC_plots', 'flags')) df.to_excel(out, index=False) def primer_RDB_test(): def extract_midori_headers(marker, fasta, all_mc_taxa_): ## write results to list output = [] ## read database fasta record_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta")) for taxon in tqdm(all_mc_taxa_, desc=marker): ## calculate number of hits per species hits = [i for i in record_dict.keys() if taxon in i] n_hits_species = len(hits) ## write species hits to fasta species_fasta = Path('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/midori_db_check/species_fastas/{}_{}.fasta'.format(taxon, marker)) f = open(species_fasta, 'w') for hit in hits: f.write(">" + str(record_dict[hit].id) + "\n") f.write(str(record_dict[hit].seq + "\n")) f.close() output.append(n_hits_species) return output ## perform analysis for all markers/databases databases = sorted(glob.glob('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/midori_db_check/midori_2_references/*.fasta')) markers = ['COI', 'lrRNA', 'srRNA'] ## collect all species all_mc_taxa = pd.read_excel('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/midori_db_check/mc_composition.xlsx')['Species'].values.tolist() all_mc_taxa_ = [i.replace(' ', '_') for i in all_mc_taxa] ## create dataframe from dict out_df = pd.DataFrame(all_mc_taxa_, columns=['Species']) ## add results to dataframe for fasta, marker in zip(databases, markers): out_df[marker] = extract_midori_headers(marker, fasta, all_mc_taxa_) primer_df = pd.read_excel('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/midori_db_check/mc_composition.xlsx', sheet_name='Primers') all_primer_combos = [i for i in primer_df.values.tolist()] ## calculate the number of sequence with adapters -> CUTADAPT for primer in all_primer_combos: ## load the required information marker = primer[0] name = primer[1] forward = primer[2] reverse = primer[3] output = [] ## add all possible combinations of primers a = [forward + '...' + reverse] b = [str(Seq(forward).reverse_complement()) + '...' + reverse] c = [str(Seq(forward).reverse_complement()) + '...' + str(Seq(reverse).reverse_complement())] d = [forward + '...' + str(Seq(reverse).reverse_complement())] ## loop through all taxa for taxon in tqdm(all_mc_taxa_, desc=name): res = [] for primer_pair in a + b + c + d: ## load the species fasta for the respective marker species_fasta = Path('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/midori_db_check/species_fastas/{}_{}.fasta'.format(taxon, marker)) ## perform cutadapt for each primer and store all sequence with primers f = subprocess.run(['cutadapt', '-a', primer_pair, '-e', '3', '--discard-untrimmed', species_fasta], capture_output=True) try: res.extend(i for i in f.stdout.decode('utf-8').replace("\t", " ").split("\n") if i.startswith('>')) except: pass ## append the highest number of sequences ## remove duplicates output.append(len(set(res))) ## add to dataframe pos = out_df.columns.tolist().index(marker) out_df.insert(pos+1, name, output) ## convert to relative abundance output_rel = [round(j[1]/j[0]*100, 2) if j[0] != 0 and j[1] != 0 else 0 for j in zip([i for i in out_df[marker]], output)] ## add to dataframe name2 = '{} (%)'.format(name) out_df.insert(pos+2, name2, output_rel) ## write excel sheet out_df.to_excel('/Users/tillmacher/Desktop/Paper/Fish_MC_paper/midori_db_check/RDB_primer_check_e3_v255.xlsx', index=False)