A text file with annotated BASH and R code detailing their use in the preparation and analysis of the data. BASH and R Code Utilized for Analysis ###### BASH CODE ######## ## Filter non-mito contigs from assemblies ## for dir in F* do wd=$(pwd) cd "$dir"/cox1_readmapping/metaspades_assembly mkdir mito-contigs grep mitochon contigs.fasta.vs.nt.cul5.1e5.megablast.out | cut -f1 | sort | uniq >mito-contigs/mitocontigids.list cd "$wd" done for dir in F* do wd=$(pwd) cd "$dir"/cox1_readmapping/metaspades_assembly/mito-contigs while read -r line do grep -w -A 1 "$line" ../contigs.fasta donemito_contigs.fasta cd "$wd" done ## Filter mito contigs that are smaller than smallest possible genes (excluding ATP8) ## for dir in F* do while read -r line do length=$(awk -F"_" '{print $4}' <(echo "$line")) nodeid=$(awk -F"_" '{print $2}' <(echo "$line")) if [[ "$length" -gt 330 ]] then grep -w -A 1 "$line" "$dir"/*mito_contigs.fasta >"$dir"/NODE_"$nodeid".fasta fi done<"$dir"/nodeids.list done ## Formatting for Community Analysis ## for dir in F* do cd "$dir" cat NODE* >"$dir"_lengthfilt_mito_contigs.fasta cd .. done ## BLAST filtered contigs against custom mitogene database ## for dir in F* do cd "$dir" blastn -query "$dir"_lengthfilt_mito_contigs.fasta -subject ~/Projects/custom_mitogene_db/refseq_aug31_2022/custom_db_working_version.fasta -outfmt "6 qseqid sseqid pident qcovs length qstart qend sstart send mismatch bitscore evalue" -out "$dir"_blastn_results.tsv cd .. done ## Extract community info from blast tables ## # 92 % ID & 32% QCOV 100bp length cutoffs (non-annotated data) for dir in F* do cd "$dir" echo "$dir" awk -F"\t" '$3 >= 92 {print $0}' "$dir"_blastn_results.tsv | awk -F"\t" '$4 >= 32 {print $0}' | awk -F"\t" '$5 >= 100 {print $0}' | cut -f2 | awk -F"|" '{print $(NF-1), $NF}' | sort | uniq >output.txt echo "$dir" >sampleid.txt #cat sampleid.txt output.txt >"$dir"_sampletaxa_98ID32QCOV.list mv output.txt "$dir"_sampletaxa_92ID32QCOV100length.list cd .. done ## Extract top blast hits from blast tables ## for dir in F* do wd=$(pwd) cd "$dir"/cox1_readmapping/metaspades_assembly/mito-contigs/ ids=$(cut -f1 blastn_results.tsv | uniq) while read -r line do grep -w "$line" blastn_results.tsv | head -n1 >tophits_blastn_results.tsv done< <(echo "$ids") cd "$wd" done # 92 % ID & 32% QCOV 100bp length cutoffs (non-annotated data) for dir in F* do wd=$(pwd) cd "$dir"/cox1_readmapping/metaspades_assembly/mito-contigs/ echo "$dir" awk -F"\t" '$3 >= 91 {print $0}' tophits_blastn_results.tsv | awk -F"\t" '$4 >= 32 {print $0}' | awk -F"\t" '$5 >= 100 {print $0}' >output.txt #echo "$dir" >sampleid.txt #cat sampleid.txt output.txt >"$dir"_sampletaxa_98ID32QCOV.list nodeids=$(cut -f1 output.txt | sort | uniq) while read -r line do grep -w "$line" output.txt done < <(echo "$nodeids") >92ID32QCOV100length_blastn_results_tophits.tsv rm output.txt cd "$wd" done # build comm presence/absence table using selected cutoffs and only for COX1 for dir in F* do wd=$(pwd) cd "$dir"/cox1_readmapping/metaspades_assembly/mito-contigs/ echo "$dir" output=$(cat 92ID32QCOV100length_blastn_results_tophits.tsv) idslist=$(cut -f1 92ID32QCOV100length_blastn_results_tophits.tsv) out=$(while read -r line do grep -w "$line" <(echo "$output") | cut -f2 done< <(echo "$idslist")) echo "$out" >92ID32QCOV100length.list outputsize=$(wc -l <(echo "$out") | cut -f1 -d" ") if [ $outputsize -gt 0 ] then for i in $( seq 1 $outputsize ) do echo "1" done for i in {1..50} do echo "0" done else echo "0" fi >detectoutput.txt echo "$dir" >dir.txt echo "detected" >fill.txt echo "$out" paste dir.txt fill.txt >header.txt paste 92ID32QCOV100length.list detectoutput.txt >tmp1 cat header.txt tmp1 >92ID32QCOV100length.tsv cd "$wd" done paste F*/cox1_readmapping/metaspades_assembly/mito-contigs/92ID32QCOV100length.tsv >outputtable_92ID32QCOV100length.tsv # extract cox1 contig tophits passing 92% id and 32% qcov cutoffs for dir in F* do cd "$dir" echo "$dir" output=$(awk -F"\t" '$3 >= 92 {print $0}' "$dir"_blastn_results.tsv | awk -F"\t" '$4 >= 32 {print $0}' | awk -F"\t" '$5 >= 100 {print $0}') idslist=$(cut -f1 <(echo "$output") | uniq) out=$(while read -r line do grep COX1 <(echo "$output") | grep -w "$line" | head -n1 done< <(echo "$idslist")) (echo "$dir" echo "$out") >"$dir"_92ID32QCOV_blastn_results.tsv cd .. done # Alternative table builder for file in F* do sampleid=$(awk -F"_" '{print $1}' <(echo $file)) echo "$sampleid" while read -r line do result=$(grep "$line" "$file") if [ -z "$result" ] then echo "$line 0" else echo "$line 1" fi done"$sampleid"_presenceabsence_tmp done # Rename sequence IDs for dir in F* do wd=$(pwd) cd "$dir" sed -i "s/NODE/'$dir'_NODE/" comm_cox1_seqs.fasta sed -i "s/'//g" comm_cox1_seqs.fasta seqidtotaxaid=$(while read -r line do grep -w "$line" "$dir"_annotated_blast_results_derep_bitscoresorted.tsv | cut -f1,2 | awk -F"|" '{print $1,$(NF-1),$NF}' | sed 's/ /_/g' doneagrifield_cox1_seqs.fasta # select top open reading frames for each node using bitscore column from blastn results for dir in F* do idlist=$(cut -f1 "$dir"/orfipy_*/blastn_results.tsv | awk -F"_" '{print $1,$2,$3}' | sed 's/ /_/g' | sort | uniq) while read -r line do grep "$line" "$dir"/orfipy_"$dir"*/blastn_results.tsv | sort -k12 | cut -f1 done< <(echo "$idlist") >"$dir"/orfipy_"$dir"_lengthfilt_mito_contigs.fasta_out/bitscore_determined_orfs_nodeids.list done # select top open reading frames for each node using bitscore column from blastn results for dir in F* do idlist=$(cut -f1 "$dir"/orfipy_*/blastn_results.tsv | awk -F"_" '{print $1,$2,$3}' | sed 's/ /_/g' | sort | uniq) while read -r line do grep "$line" "$dir"/orfipy_"$dir"*/blastn_results.tsv | sort -k12 done< <(echo "$idlist") >"$dir"/orfipy_"$dir"_lengthfilt_mito_contigs.fasta_out/bitscore_determined_orfs_blastn_results.tsv done # Updated sequence ID renaming script # includes node ID, accession ID, position matched to reference, taxonomy for dir in F* do wd=$(pwd) cd "$dir" # Pull fasta of ORFs for each node for each sample cp ../../"$dir"/orfipy_"$dir"_lengthfilt_mito_contigs.fasta_out/output-dna.fasta ./ # format files sed -i 's/\[/\(/g' output-dna.fasta sed -i 's/\[/\(/g' "$dir"_annotated_blast_results_derep_bitscoresorted.tsv # extract relevant ORFs for each node grep --no-group-separator -A 1 -w -f cox1seqids.list output-dna.fasta >cox1_seqs.fasta # loop through each relevant sequence, extract, and modify sequence header while read -r line do currentseqid=$(grep -w "$line" cox1_seqs.fasta | sed 's/>//') blastnout=$(grep -w "$line" "$dir"_annotated_blast_results_derep_bitscoresorted.tsv) node=$(cut -f1 -d" " <(echo "$currentseqid") | awk -F"_" '{print $1,$2}' | sed 's/ /_/g') length=$(cut -f4 -d" " <(echo "$currentseqid") | cut -f2 -d":") refid=$(cut -f2 <(echo "$blastnout") | awk -F"|" '{print $1}') reftax=$(cut -f2 <(echo "$blastnout") | awk -F"|" '{print $(NF-1),$NF}' | sed 's/ /_/g') refstartpos=$(cut -f10 <(echo "$blastnout")) refstoppos=$(cut -f11 <(echo "$blastnout")) replacewith=$(echo "$dir" "$node" "length_""$length" "$refid" "$reftax" "start""$refstartpos" "stop""$refstoppos" | sed 's/ /_/g' | sed 's/>//') echo "$currentseqid" echo "$replacewith" sed -ier "s/$currentseqid/$replacewith/g" cox1_seqs.fasta done0,1,sum) # Chao1 Chao1 <- apply(t(otutable), 1, chao1) # combine into a single data table alternative.alphadiv<-as.data.frame(cbind(Shannon,Simpson,Invert.Simpson,Richness,Chao1)) sub.map<-subset(map, rownames(map) %in% rownames(alternative.alphadiv)) rownames(alternative.alphadiv) == rownames(sub.map) # Loop by Sample to Summarize Richness sampleids<-rownames(alternative.alphadiv) for ( i in 1:length(sampleids)) { sampleid<-sampleids[i] sampledat<-subset(alternative.alphadiv, rownames(alternative.alphadiv) == sampleid) print(sampledat$Richness) # Change column name for desired variable } # Plots map<-read.csv("compare_map.csv", header = T, row.names = 1) map<-map[order(rownames(map)),] map$Metavariable <- factor(map$Metavariable, levels = c("1.0 – 25 cm.Annotated","1.0 – 25 cm.Morpho","1.25 – 50 cm.Annotated","1.25 – 50 cm.Morpho","1.50 -75 cm.Annotated","1.50 -75 cm.Morpho","1.75-100 cm.Annotated","1.75-100 cm.Morpho","2.0 – 25 cm.Annotated","2.0 – 25 cm.Morpho","2.25 – 50 cm.Annotated","2.25 – 50 cm.Morpho","2.50 -75 cm.Annotated","2.50 -75 cm.Morpho","2.75-100 cm.Annotated","2.75-100 cm.Morpho","4.0 – 25 cm.Annotated","4.0 – 25 cm.Morpho","4.25 – 50 cm.Annotated","4.25 – 50 cm.Morpho","4.50 -75 cm.Annotated","4.50 -75 cm.Morpho","4.75-100 cm.Annotated","4.75-100 cm.Morpho","3.0 – 25 cm.Annotated","3.0 – 25 cm.Morpho","3.25 – 50 cm.Annotated","3.25 – 50 cm.Morpho","3.50 -75 cm.Annotated","3.50 -75 cm.Morpho","3.75-100 cm.Annotated","3.75-100 cm.Morpho")) myColors<-c("lightgray","lightblue") pdf("richness_trophic.pdf") boxplot(map$Treefiltered.Richness.Trophic ~ map$Metavariable, col = myColors, las = 2, cex.axis = 0.3, cex.lab = 0.1) dev.off() pdf("richness_plantparasites.pdf") boxplot(map$Treefiltered.Richness.PP ~ map$Metavariable, las = 2, col = myColors, cex.axis = 0.3, cex.lab = 0.1) dev.off() sub.map<-subset(map, map$Method == "Annotated") sub.map<-droplevels(sub.map) sampleorder<-c("F111-025","F111-2550","F111-5075","F111-75100","F112-025","F112-2550","F112-5075","F112-75100","F121-025","F121-2550","F121-5075","F121-75100","F122-025","F122-2550","F122-5075","F122-75100","F211-025","F211-2550","F211-5075","F211-75100","F212-025","F212-2550","F212-5075","F212-75100","F221-025","F221-2550","F221-5075","F221-75100","F222-025","F222-2550","F222-5075","F222-75100","F411-025","F411-2550","F411-5075","F411-75100","F412-025","F412-2550","F412-5075","F412-75100","F421-025","F421-2550","F421-5075","F421-75100","F422-025","F422-2550","F422-5075","F422-75100","F311-025","F311-2550","F311-5075","F311-75100","F312-025","F312-2550","F312-5075","F312-75100","F321-025","F321-2550","F321-5075","F321-75100","F322-025","F322-2550","F322-5075","F322-75100") sub.map2<-sub.map[sampleorder,] sub.map2$Metavariable <- factor(sub.map2$Metavariable, levels = c("1.0 – 25 cm.Annotated","1.25 – 50 cm.Annotated","1.50 -75 cm.Annotated","1.75-100 cm.Annotated","2.0 – 25 cm.Annotated","2.25 – 50 cm.Annotated","2.50 -75 cm.Annotated","2.75-100 cm.Annotated","4.0 – 25 cm.Annotated","4.25 – 50 cm.Annotated","4.50 -75 cm.Annotated","4.75-100 cm.Annotated","3.0 – 25 cm.Annotated","3.25 – 50 cm.Annotated","3.50 -75 cm.Annotated","3.75-100 cm.Annotated")) myColors = c("black","#646060","#817F7F","#B5B0B0","#F20808","#EE5353","#F58B8B","#F9C3C3","#065F09","#358438","#6CAC6E","#CEF6D0","#07185A","#31407C","#7482BD","#C4CFF9") pdf("richness_annotatedcox1_v3.pdf") boxplot(sub.map2$Richness ~ as.factor(sub.map2$Metavariable), col = myColors, las = 2, cex.axis = 0.3, cex.lab = 0.1) dev.off() # Alphadiv Stats # Generalized linear model summary(glm(alternative.alphadiv$Richness ~ Treatment, data = plot.map)) #### Metadata Analysis #### # Metadata Correlations sub.sub.map<-sub.map[,4:ncol(sub.map)] pval <- psych::corr.test(sub.sub.map, adjust="none")$p corrplot(cor(sub.sub.map, use = "complete.obs", method = "pearson"), type="lower", p.mat=pval, tl.pos="n") corrplot(cor(sub.sub.map, use = "complete.obs", method = "pearson"), type="lower", p.mat=pval, sig.level=0.05, add=T, tl.pos="ld", cl.pos="n", cl.cex = 0.5, pch.cex = 0.5, tl.cex = 0.5, number.cex = 0.1)