Testing primers against PR2
1 Goal
Display primer set matches against the PR2 database (latest version 4.12.0)
Matches are computed by an independent R script (PR2 Primers pr2_match.R) and stored in an rda file which is read by this Rmd file.
In this version # of mismatches is computed as well as position of mismatches
1.1 To do
- Check why there are some negative amplicon (probably some matches ahead - Put a condition that amplicon should be >)
2 Initialize
Load the variables common to the different scripts and the necessary libraries
# Libraries for bioinfo ----------------------------------------------------
library("Biostrings")
# Libraries tidyr ---------------------------------------------------------
library("ggplot2")
theme_set(theme_light())
library("dplyr")
library("readxl")
library(openxlsx)
library("tibble")
library("readr")
library("purrr")
library("forcats")
library("lubridate")
library("stringr")
library("rio")
library(patchwork)
# Libraries dvutils and pr2database -------------------------------------------------------
if(any(grepl("package:dvutils", search()))) detach("package:dvutils", unload=TRUE)
library("dvutils")
# if(any(grepl("package:pr2database", search()))) detach("package:pr2database", unload=TRUE)
# library("pr2database")
# Options for knitting the report -------------
library(knitr)
library(rmdformats)
library(kableExtra)
knitr::opts_chunk$set(fig.width=8,
fig.height=6,
eval=TRUE,
cache=TRUE,
# For next line see https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
cache.lazy = FALSE,
echo=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=90)
options(max.print="500")
options(knitr.kable.NA = '')
3 Constants
file_param <- "../R_scripts/param_pr2_primers.R"
source(file_param)
cat(readChar(file_param, 1e+05))
max_mismatch = 2
gene_selected = "18S_rRNA" # Can be 18S_rRNA or 16_rRNA
rda_file_label = "_all" # This label is added at the end of the rda file name
gene_regions = c("V4", "V9")
kingdoms = c("Eukaryota", "Bacteria", "Archaea")
# These primers should not be included (semi nested PCR or amplify very few sequences)
primer_sets_excluded = c(73, 74, 37, 67, 91, 68)
sequence_length_min = 1350
sequence_length_min_V9 = 1650
sequence_length_min_V4 = 1200
4 Read primer file
Only keep the 18S primers with V region
primers <- import("../data/primers.rds")
primer_sets <- import("../data/primer_sets.rds")
if (gene_selected == "18S_rRNA") {
primers <- primers %>% filter(gene == "18S rRNA", !is.na(doi))
primer_sets <- primer_sets %>% filter(gene == "18S rRNA", !is.na(doi), !str_detect(gene_region, "ITS|cloning|full"))
gene_regions_all = unique(primer_sets$gene_region)
print(gene_regions_all)
} else if (gene_selected == "16S_rRNA") {
primer_sets <- primer_sets %>% filter(specificity == "plastid" | (gene == "18S rRNA" & specificity == "universal"))
}
[1] "37F-41F" "Helix 37 and 37f" "V1-V2" "V2" "V2-V3" "V3" "V3-V4"
[8] "V4" "V4-V5" "V5" "V6" "V6-V8" "V7" "V7-V8"
[15] "V7-V9" "V8-V9" "V9"
primer_sets <- primer_sets %>% filter(!(primer_set_id %in% primer_sets_excluded)) %>% mutate(specific = ifelse(is.na(specificity), "general", "specific")) %>%
relocate(specific, .before = specificity)
# file_name = str_c('output/Table_primers_',gene_selected, '.xlsx') export(primer_sets, file = file_name, firstActiveRow = 2)
n_primers <- list()
n_primers[["general"]] <- nrow(filter(primer_sets, specific == "general"))
n_primers[["specific"]] <- nrow(filter(primer_sets, specific == "specific"))
n_primers
$general
[1] 44
$specific
[1] 25
knitr::kable(select(primer_sets, primer_set_name, primer_set_id, fwd_name, rev_name, gene_region, amplicon_size)) %>% kableExtra::kable_styling()
primer_set_name | primer_set_id | fwd_name | rev_name | gene_region | amplicon_size |
---|---|---|---|---|---|
Li_2020 | 109 | s14F3 | 17 | 37F-41F | |
Clerissi_2018 | 81 | 18SV1V2F | 18SV1V2R | V1-V2 | 341 |
Bik_2012 | 80 | F04 | R22 | V1-V2 | 402 |
Rachik_2018 | 110 | 18S-82F | Euk-516r | V2-V3 | 481 |
Lentendu_2014c | 72 | Kineto_80 | Kineto_651 | V2-V3 | 667 |
Santoferrara_2018 | 59 | 152+ | 528- | V2-V3 | 352 |
Lentendu_2014b | 69 | Chryso_240 | Chryso_651 | V2-V3 | 545 |
Sisson_2018 | 84 | SAR_V3_F | SAR_V3_R | V3 | 182 |
Lentendu_2014a | 62 | Cer2F | Cer1R | V3-V4 | 588 |
Michaud_2019a | 87 | Oxy_18S-F | Oxy_18S-R | V3-V4 | 444 |
Kataoka_2017 | 108 | 545F | 1119R | V4 | 561 |
Zhan | 40 | Uni18SF | Uni18SR | V4 | 461 |
Brate1 | 13 | 3NDf | V4_euk_R1 | V4 | 465 |
Brate2 | 14 | 3NDf | V4_euk_R2 | V4 | 458 |
Geisen | 12 | 3NDf | 1132rmod | V4 | 599 |
Lambert | 34 | 515FY | NSR951 | V4 | 391 |
Needham | 33 | 515F Univ | 926R | V4 | 589 |
Parfrey | 18 | 515F | 1119r | V4 | 598 |
Hugerth_2 | 4 | 563f | 1132r | V4 | 587 |
Piwosz_2019 | 102 | TAReuk454FWD1 | HaptoR1 | V4 | 279 |
Piredda_V4 | 16 | TAReuk454FWD1 | V4 18S Next.Rev | V4 | 417 |
Bass | 7 | V4_1f | TAReukREV3 | V4 | 417 |
Stoeck_V4_2 | 8 | TAReuk454FWD1 | TAReukREV3 | V4 | 417 |
Stoeck_V4_1 | 36 | TAReuk454FWD1 | V4RB | V4 | 417 |
Choi_2020 | 104 | TAReuk454FWD1_Choi | TAReukREV3_Choi | V4 | 417 |
Belevich_2017 | 86 | EuF-V4 | picoR2 | V4 | 425 |
Bradley_2016_V4 | 90 | TAReuk454FWD1 | V4r | V4 | 417 |
Vannini | 19 | Claudia Vannini (F) | Claudia Vannini (R) | V4 | 439 |
Hadziavdic_1 | 1 | F-566 | R-1200 | V4 | 635 |
Moreno | 15 | EUKAF | EUKAR | V4 | 410 |
Emberg_2018 | 103 | E572F | 897r | V4 | 344 |
Kim_V4_2016 | 22 | 528F | Nex_18S_0964_R | V4 | 419 |
Comeau | 17 | E572F | E1009R | V4 | 438 |
Mangot | 25 | NSF563 | NSR951 | V4 | 380 |
Xu_2020 | 99 | A-528F | B-706R | V4 | 344 |
Hadziavdic_2 | 2 | A-528F | R-952 | V4 | 379 |
Egge | 39 | A-528F | PRYM01+7 | V4 | 396 |
Fadev_2018 | 98 | A-528F | V4RB | V4 | 408 |
Hugerth_6 | 83 | A-528F | 1132r | V4 | 577 |
Kilias_2013 | 100 | A-528F | 1055R | V4 | 709 |
Bass_2020 | 119 | 574*f | UNonMet DB | V4 | 566 |
Hugerth_1 | 3 | 574*f | 1132r | V4 | 576 |
Hugerth_5 | 77 | 574f | 1132r | V4 | 576 |
Venter | 23 | 590F | 1300R | V4 | 720 |
Zimmerman | 21 | D512for | D978rev | V4 | 437 |
Harder | 41 | Cerc479F | Cerc750R | V4 | 283 |
Simon | 24 | EK-565F-NGS | EUK1134-R | V4 | 527 |
Fiore-Donno_2018c | 65 | S616F_Cerco | S947R_Cerco | V4 | 348 |
Fiore-Donno_2018d | 66 | S616F_Eocer | S947R_Cerco | V4 | 348 |
Fiore-Donno_2018a | 63 | S616F_Cerco | S963R_Cerco | V4 | 367 |
Fiore-Donno_2018b | 64 | S616F_Eocer | S963R_Cerco | V4 | 367 |
Hugerth_3 | 5 | 616f | 1132r | V4 | 534 |
Hugerth_4 | 6 | 616*f | 1132r | V4 | 534 |
UNonMet | 35 | EUK581-F | EUK1134-R | V4 | |
Stokes_2002 | 96 | QPX-F | QPX-R2 | V4 | |
Hu_2016 | 101 | 574*f | 1132R modified | V4-V5 | 576 |
Michaud_2019b | 88 | Par_18S-F | Par_18S-R | V5 | 562 |
Moro | 38 | ChloroF | ChloroR | V5 | 494 |
Stokes_2002 | 97 | LABY-A | LABY-Y | V6 | 434 |
Wilkins | 32 | 926wF | 1392-R | V6-V8 | 513 |
Huo_2020 | 107 | 960F | NSR1438 | V7 | 264 |
Lundgreen_2019 | 76 | F-1183 | R-1443 | V7 | 261 |
Chemidlin_2011 | 92 | FF390 | FR1 | V7-V8 | 349 |
Kim_V9_2016 | 106 | Nex_18S_1434_F | Nex_18S_1757_R | V8-V9 | 371 |
Bradley_2016_V9 | 89 | V8f | 1510R | V8-V9 | 372 |
Amaral_1 | 28 | 1380F | 1510R | V9 | 176 |
Amaral_2 | 29 | 1389F | 1510R | V9 | 167 |
Piredda_V9 | 31 | 1388F | 1510R | V9 | 167 |
Stoeck_V9 | 27 | 1391F | EukB | V9 | 169 |
5 Computing the matches
This part is done by an R script script_primers_pr2_match.R
that is executed in batch mode on Roscoff server.
6 Read the file for all primer sets and filter
This file is created by the R script script_update files.R
# For testing
pr2_match_final <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, ".rds"))
print(str_c("Before filtration: ", nrow(pr2_match_final)))
[1] "Before filtration: 11053938"
# Replace the gene_region and primer_set_label_short since they can be changed in the database
primer_sets_labels <- primer_sets %>% mutate(primer_set_label_short = str_c(str_sub(gene_region, 1, 2), sprintf("%03d", primer_set_id), str_sub(str_replace_na(specificity,
replacement = ""), 1, 3), sep = " "), primer_set_label_long = str_c(gene_region, primer_set_name, "-", str_replace_na(specificity, "general"),
sep = " ")) %>% # Remove the last underscore if left by itself
mutate(primer_set_label_short = str_replace(primer_set_label_short, " $", "")) %>% select(primer_set_id, primer_set_label_short, primer_set_label_long,
gene_region, specific, specificity)
pr2_match_final <- pr2_match_final %>% left_join(primer_sets_labels) %>% # Remove sequences for which the introns have been removed
filter(!str_detect(pr2_accession, "_UC")) %>% # Remove sequence that are shorter
filter((str_detect(gene_region, "V4") & sequence_length >= sequence_length_min_V4) | (str_detect(gene_region, "V9") & sequence_length >= sequence_length_min_V9) |
(!str_detect(gene_region, "V4|V9") & sequence_length >= sequence_length_min)) %>% mutate(mismatch_number = fwd_mismatch_number + rev_mismatch_number) %>%
# Only keep the selected primers
filter(primer_set_id %in% primer_sets$primer_set_id)
print(str_c("After filtration: ", nrow(pr2_match_final)))
[1] "After filtration: 6807694"
7 Read the summarized tables
pr2_match_summary_primer_set <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary.rds"))
pr2_match_summary_primer_set_sg <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary_sg.rds"))
pr2_match_summary_primer_set_class <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary_class.rds"))
# Long form for Number of sequences
# This dataframe is used to re-order the bars correctly
pct_category_order <- data.frame(pct_category = c("ampli_pct", "fwd_pct", "rev_pct"), pct_category_order = c(1, 3, 2))
pr2_match_summary_primer_set_long <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "pct_category", values_to = "pct_seq", cols = fwd_pct:ampli_pct) %>%
left_join(pct_category_order)
8 Tables
8.1 Path for tables
8.2 Primers
table_primers <- primers %>% mutate(specific = ifelse(is.na(specificity), "general", "specific")) %>% filter(!str_detect(specificity, "blocking") |
is.na(specificity)) %>% relocate(specific, .before = specificity) %>% group_by(direction, specific) %>% count() %>% arrange(-n) %>% tidyr::pivot_wider(names_from = specific,
values_from = n)
8.3 Primers sets
8.4 Primer sets statistics
table_primer_sets_ampli <- pr2_match_summary_primer_set %>% filter(kingdom == "Eukaryota") %>% group_by(specific) %>% summarise(across(.cols = all_of(c("fwd_pct",
"rev_pct", "ampli_pct", "ampli_size_mean")), .fns = list(min = min, mean = mean, max = max))) %>% tidyr::pivot_longer(cols = matches("min|max|mean"),
names_to = "parameter", values_to = "values") %>% tidyr::pivot_wider(names_from = specific, values_from = values)
8.5 Save to Excel file
8.6 Export to Latex tables
8.6.1 Define function to format the tables
- Not used
- “X” = “\\cellcolor{gray}”
sanitize.italics <- function(str) {
str_replace_all(str, c("_" = " ",
# "ital\\{" = "\\\\textit{",
"°" = "\\\\degree",
"±"="$\\pm$"))
}
# See: https://www.rdocumentation.org/packages/xtable/versions/1.8-4/topics/print.xtable
latex_table <- function(table, caption, label, file, align,
scalebox = getOption("xtable.scalebox", NULL),
digits = 0,
add.to.row = getOption("xtable.add.to.row", NULL),
include.colnames = TRUE,
tabular.environment = getOption("xtable.tabular.environment", "tabular") ) {
table <- xtable::xtable(table,
label=label,
caption=caption,
align = align,
digits=digits)
print(table, scalebox = scalebox,
caption.placement = "top",
include.rownames = FALSE,
add.to.row = add.to.row,
include.colnames = include.colnames,
tabular.environment = tabular.environment,
file=file,
sanitize.text.function = sanitize.italics)
}
8.6.3 Table - Primers
latex_table(table = table_primers, caption = "Type of primers listed in the pr2-primers database. General primers target all eukaryotes and specific primers only certain taxonomic groups.",
label = "tab:primers", file = full_file_name("table_primers.tex"), align = c("l", "c", "c", "c"), scalebox = 1, digits = 0)
8.6.4 Table - Primer sets
8.6.5 Table - Primer sets stats
8.7 Supplementary Table - Primers
table <- primers %>%
select(primer_id, name, sequence, direction, start_yeast, specificity, doi)
add.to.row <- list()
add.to.row$pos <- list(0, 0, 0, 0, 0)
add.to.row$command <- c(
# Header for first page
"id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
"\\endfirsthead \n \\hline \n",
# Header for other pages
"id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
"\\hline \n \\endhead \n",
# Footers
"\\hline \n \\endfoot \n \\endlastfoot \n"
)
latex_table(table = table,
caption = "List of primers in the pr2-primers database ordered by start position relative to the sequence of the yeast \textit{Saccharomyces cerevisiae} (FU970071).",
label = "tabsup:primers",
file = full_file_name("table_sup_primers.tex"),
align = c("l","l","l", "l","l","l","l", "l" ),
digits= 0,
add.to.row = add.to.row,
include.colnames = FALSE,
tabular.environment = "longtable"
)
8.8 Supplementary Table - Primer sets
table <- primer_sets %>%
select(primer_set_id, primer_set_name, fwd_name, rev_name, gene_region, specificity, doi) %>%
arrange(primer_set_id)
add.to.row <- list()
add.to.row$pos <- list(0, 0, 0, 0, 0)
add.to.row$command <- c(
# Header for first page
"id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
"\\endfirsthead \n \\hline \n",
# Header for other pages
"id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
"\\hline \n \\endhead \n",
# Footers
"\\hline \n \\endfoot \n \\endlastfoot \n"
)
latex_table(table = table,
caption = "List of primer sets in the pr2-primers database.",
label = "tabsup:primer_sets",
file = full_file_name("table_sup_primer_sets.tex"),
align = c("l","l","l", "l","l","l","l", "l" ),
digits= 0,
add.to.row = add.to.row,
include.colnames = FALSE,
tabular.environment = "longtable"
)
9 Graphics
9.2 Amplicon length
9.2.1 Scatter
ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id,
color = as.factor(mismatch_number))) + geom_point(size = 3) + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(x = "Primer set") + scale_color_viridis_d() + facet_wrap(vars(specific), nrow = 2,
scales = "free_x")
9.2.2 Average size
ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id,
fill = as.factor(mismatch_number))) + geom_boxplot(outlier.alpha = 0.3) + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(x = "Primer set") + scale_fill_viridis_d() + facet_wrap(vars(specific), nrow = 2,
scales = "free_x")
9.3 Example with sets V4 and V9
9.3.1 Plot an example of amplicon distribution (Fig. 3)
for (one_primer_set in c(8, 27)) {
if (one_primer_set == 8) {
xmin = 400
xmax = 450
xmax2 = 2000
} else {
xmin = 140
xmax = 190
xmax2 = 1000
}
pr2_match_final_one <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom ==
"Eukaryota") %>% filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X", "Protoalveoalata")))
primer_set_label_long = pr2_match_final_one$primer_set_label_long[1]
g <- ggplot(pr2_match_final_one, aes(x = ampli_size)) + geom_density(fill = "blue", alpha = 0.9) + xlab("Amplicon size") +
ggtitle(str_c("Primer set - ", primer_set_label_long)) + xlim(xmin, xmax)
print(g)
g <- ggplot(pr2_match_final_one, aes(x = ampli_size, fill = supergroup)) + geom_density(alpha = 0.9) +
theme_bw() + # theme(legend.position = 'none') +
guides(fill = guide_legend(nrow = 3)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") +
labs(x = "Amplicon size (bp)", y = "Density", fill = "Supergroup") + # ggtitle(str_c('Primer set - ', primer_set_label_long)) +
xlim(xmin, xmax)
print(g)
fig3[[str_c(one_primer_set, " size_distri")]] <- g
g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_boxplot(outlier.alpha = 0.3) +
theme_bw() + coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Supergroup") + ylab("Amplicon size (bp)") + ylim(0, xmax2) + geom_hline(yintercept = c(450, 550),
linetype = c(2, 3))
print(g)
fig3[[str_c(one_primer_set, " size")]] <- g
g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_violin() + coord_flip() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Supergroup") + ylab("Amplicon size (bp)")
print(g)
}
9.3.2 Plot an example of percent amplification
for (one_primer_set in c(8, 27)) {
pr2_match_summary_filtered <- pr2_match_summary_primer_set_sg %>% filter((n_seq > 20) & (primer_set_id == one_primer_set) & (kingdom == "Eukaryota") &
!(supergroup %in% c("Apusozoa", "Eukaryota_X")))
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(supergroup, " - n = ", n_seq), y = ampli_pct,
fill = supergroup), position = "dodge") + theme_bw() + # theme(legend.position = 'none') +
guides(fill = guide_legend(nrow = 2)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") +
coord_flip() + labs(x = "% of sequences amplified", y = "", legend = "Supergroup") + scale_y_continuous(minor_breaks = seq(0, 100, by = 10),
breaks = seq(0, 100, by = 20), limits = c(0, 100))
# ggtitle (str_c('Set - ', one_primer_set, ' - % amplified per Supergroup') )
print(g)
fig3[[str_c(one_primer_set, " pct")]] <- g
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(supergroup, " - n = ", n_seq), y = ampli_size_mean),
colour = "black") + coord_flip() + geom_errorbar(aes(x = str_c(supergroup, " - n = ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
ampli_size_sd)) + xlab("Supergroup") + ylab("Amplicon size (bp)") # +
# scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set - ', one_primer_set, ' - Amplicon sizes') ) +
# geom_hline(yintercept = c(450,550) , linetype= 2)
print(g)
}
9.3.3 Plot an example of mismatches
for (one_primer_set in c(8, 27)) {
pr2_mismatches <- pr2_match_summary_primer_set_sg %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", cols = contains("ampli_mismatch"),
names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>%
mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20) & (primer_set_id == one_primer_set) & (kingdom ==
"Eukaryota") & !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
g <- ggplot(pr2_mismatches, aes(x = str_c(supergroup, " - n = ", n_seq), y = mismatch_pct, group = primer_set_id, fill = as.factor(mismatch_number))) +
geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Supergroup", y = "% of sequences with mismatches",
fill = "Mismatches") + scale_fill_viridis_d(direction = -1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
legend.box = "horizontal") + coord_flip()
print(g)
fig3[[str_c(one_primer_set, "mismatches", sep = " ")]] <- g
print(g)
}
9.3.4 Plot an example of mismatches positions
for (one_primer_set in c(8, 27)) {
df <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom == "Eukaryota") %>% filter(!(supergroup %in% c("Apusozoa",
"Eukaryota_X")))
g <- ggplot() + geom_histogram(data = filter(df, !is.na(fwd_mismatch_primer_position_5prime)), aes(x = fwd_mismatch_primer_position_5prime, fill = supergroup),
binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() + scale_x_continuous(minor_breaks = 0:30) +
labs(y = "Density", x = "Position of mismatches from 5' end", title = "fwd")
print(g)
fig3[[str_c(one_primer_set, "mismatches positions fwd", sep = " ")]] <- g
g <- ggplot() + geom_histogram(data = filter(df, !is.na(rev_mismatch_primer_position_5prime)), aes(x = rev_mismatch_primer_position_5prime, fill = supergroup),
binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() + scale_x_continuous(minor_breaks = 0:30) +
labs(y = "Density", x = "Position of mismatches from 5' end", title = "rev")
print(g)
fig3[[str_c(one_primer_set, "mismatches positions rev", sep = " ")]] <- g
}
9.4 All Eukaryotes
Comments
- Percent of sequences amplified
- Logically, the % of seq amplified is always < min(% of sequences matching forwar, % of sequences matching reverse)
- In general it is the reverse primer that causes problems.
- Some primer sets do not amplify any sequence (11, 19, 20, 21)
- For V9, primer set 30 reverse primer is in the ITS region which is not present in the PR2 sequences, so no amplification.
- Amplicon sizes
- Only 8 V4 primer sets are suitable for Illumina 2x250 (max amplicon size = 450 bp)
- This is if we consider the mean… If we consider the variation around the mean then, only 3 suitable for 2x250
- Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)
9.4.1 Plot only V4 and V9 - For Fig. 2
for (one_kingdom in kingdoms) {
for (one_region in gene_regions) {
pr2_match_summary_primer_set_region_long <- pr2_match_summary_primer_set_long %>% filter(kingdom == one_kingdom) %>% filter(gene_region ==
one_region) %>% # Remove the group specific primers
filter(specific == "general") %>% filter(pct_category %in% c("ampli_pct", "fwd_pct", "rev_pct"))
pr2_match_summary_primer_set_region <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% filter(gene_region == one_region) %>%
# Remove the group specific primers)
filter(specific == "general")
pr2_match_region <- pr2_match_final %>% filter(kingdom == one_kingdom) %>% filter(gene_region == one_region) %>% # Remove the group specific primers
filter(specific == "general")
# % Ampli
g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = str_replace(str_c(sprintf("%03d", primer_set_id), primer_set_label_long,
sep = " - "), " - general", ""), y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), width = 0.7, position = "dodge") +
theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") + scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue",
rev_pct = "red"), labels = c("Amplicons", "rev", "fwd")) + ggtitle(str_c(one_kingdom, one_region, "Percentage of sequences recovered",
sep = " - ")) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) +
ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
print(g)
# Size
fig1[[str_c(one_kingdom, one_region, "pct", sep = " ")]] <- g
g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(sprintf("%03d", primer_set_id),
primer_set_label_long, sep = " - "), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(sprintf("%03d", primer_set_id),
primer_set_label_long, sep = " - "), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") +
ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c(one_kingdom, one_region, "Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0,
850) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip()
print(g)
fig1[[str_c(one_kingdom, one_region, "size", sep = " ")]] <- g
# Mismatches
pr2_mismatches <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct",
cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(gene_region == one_region) %>% # Remove the group specific primers
filter(specific == "general")
g <- ggplot(pr2_mismatches, aes(x = str_c(sprintf("%03d", primer_set_id), primer_set_label_long, sep = " - "), y = mismatch_pct, group = primer_set_id,
fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Primer set",
y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom, one_region, sep = " - ")) + scale_fill_viridis_d(direction = -1,
alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
print(g)
fig1[[str_c(one_kingdom, one_region, "mismatches", sep = " ")]] <- g
}
}
9.4.2 Plot all with panel general and specific - For Fig S1
for (one_kingdom in kingdoms) {
for (specific_one in c("general", "specific")) {
df <- pr2_match_summary_primer_set_long %>% filter(kingdom == one_kingdom) %>% filter(pct_category %in% c("ampli_pct", "fwd_pct", "rev_pct")) %>%
filter(specific == specific_one)
g <- ggplot(df) + geom_col(aes(x = str_replace(str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), " - general", ""),
y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") +
ylab("% of sequences amplified") + scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue", rev_pct = "red"), labels = c("Amplicons",
"rev", "fwd")) + ggtitle(str_c(one_kingdom, specific_one, "Percentage of sequences recovered", sep = " - ")) + theme(axis.text.y = element_text(angle = 0,
hjust = 0, vjust = 0)) + ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
print(g)
fig1[[str_c(one_kingdom, specific_one, "pct", sep = " ")]] <- g
df <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% filter(!is.nan(ampli_size_mean)) %>% filter(specific == specific_one)
g <- ggplot(df) + geom_point(aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), y = ampli_size_mean), colour = "black") +
geom_errorbar(aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), ymax = ampli_size_mean + ampli_size_sd,
ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c(one_kingdom, specific_one, " - Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0,
900) + theme(axis.text.y = element_text(angle = 0, hjust = 0)) + coord_flip()
print(g)
fig1[[str_c(one_kingdom, specific_one, "size", sep = " ")]] <- g
}
}
9.5 Mismatches - for Fig.S1
9.5.1 Number of mismatches
for (one_kingdom in kingdoms) {
for (specific_one in c("general", "specific")) {
pr2_mismatches <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", cols = contains("ampli_mismatch"),
names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>%
mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(specific == specific_one) %>% filter(kingdom == one_kingdom)
g <- ggplot(pr2_mismatches, aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), y = mismatch_pct, group = primer_set_id,
fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Primer set",
y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom, specific_one, sep = " - ")) + scale_fill_viridis_d(direction = -1,
alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
print(g)
fig1[[str_c(one_kingdom, specific_one, "mismatches", sep = " ")]] <- g
}
}
9.5.2 Position of mismatches (only for Eukaryota)
ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = fwd_mismatch_primer_position_5prime),
outlier.alpha = 0.3, color = "blue") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0, vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25,
breaks = seq(0, 25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer fwd") + facet_wrap(vars(specific),
nrow = 2, scales = "free_y")
ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = rev_mismatch_primer_position_5prime),
outlier.alpha = 0.3, color = "red") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0, vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25,
breaks = seq(0, 25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer rev") + facet_wrap(vars(specific),
nrow = 2, scales = "free_y")
ggplot(filter(pr2_match_final, !is.na(rev_mismatch_primer_position_5prime), kingdom == "Eukaryota")) + geom_histogram(aes(x = rev_mismatch_primer_position_5prime),
color = "red", binwidth = 1, stat = "density") + theme_bw() + scale_x_continuous(minor_breaks = 0:30, breaks = seq(0, 30, by = 5), limits = c(0,
30)) + labs(y = "Primer set", x = "Position of mismatches", title = "Primer rev") + facet_wrap(vars(primer_set_label_long), ncol = ncol)
## By supergroup (only for Eukaryota) - For Fig. S2 and S5
Comments:
- Excavata have a very different patterns from the rest of the group. They are not amplified by quite a few primer sets. They have also bigger amplicons
- Some groups exhibit higher variability in amplicon size (e.g Chlorophyta)
9.5.3 % of Ampli and Amplicon size
fig_supergroup <- list()
ncol = 8
for (specific_one in c("general", "specific")) {
pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>%
filter(specific == specific_one) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa",
"Eukaryota_X")))
g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = supergroup, y = ampli_pct, fill = supergroup),
position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Supergroup") +
ggtitle(str_c(specific_one, " primers")) + scale_fill_viridis_d(option = "inferno") + ylim(0, 100) +
facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol) + guides(fill = guide_legend(nrow = 1)) +
theme(legend.position = "top", legend.box = "horizontal") + theme(legend.position = "none")
print(g)
list_label <- str_c("pct", specific_one, sep = " ")
fig_supergroup[[list_label]] <- g
g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = supergroup,
y = ampli_size_mean), colour = "black") + theme_bw() + coord_flip() + geom_errorbar(aes(x = supergroup,
ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") +
ylab("Amplicon size (bp)") + ggtitle(str_c(specific_one, " primers")) + geom_hline(yintercept = c(450,
550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol)
print(g)
list_label <- str_c("size", specific_one, sep = " ")
fig_supergroup[[list_label]] <- g
}
9.5.4 Number of mismatches (Only Eukaryotes) - Fig. S2 and S5
ncol = 8
pr2_mismatches_sg <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct",
cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa",
"Eukaryota_X")))
for (specific_one in c("general", "specific")) {
pr2_mismatches_sg_one <- pr2_mismatches_sg %>% filter(specific == specific_one)
g <- ggplot(pr2_mismatches_sg_one, aes(x = supergroup, y = mismatch_pct, fill = fct_rev(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 0,
hjust = 1)) + labs(x = "Group", y = "% of sequences with mismatches", title = str_c(specific_one, " primers"), fill = "Mismatches") + scale_fill_viridis_d(direction = 1,
alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip() + facet_wrap(vars(primer_set_label_short),
ncol = ncol)
print(g)
fig_supergroup[[str_c("mismatches", specific_one, sep = " ")]] <- g
}
9.6 By class for autotrophs (Only EUkaryotes) - Fig. S6
fig_class <- list()
ncol = 8
for (specific_one in c("general", "specific")) {
pr2_match_summary_filtered_one <- pr2_match_summary_primer_set_class %>% filter(n_seq > 20) %>% filter(division %in%
c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta")) %>% filter(specific ==
specific_one)
g <- ggplot(pr2_match_summary_filtered_one) + geom_col(aes(x = str_c(str_trunc(division, 20, ellipsis = ""),
"-", class), y = ampli_pct, fill = division), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") +
xlab("Class") + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + # scale_fill_viridis_d(option = 'plasma') +
scale_fill_brewer(palette = "Accent") + ylim(0, 100) + facet_wrap(vars(primer_set_label_short), scales = "fixed",
ncol = ncol) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
print(g)
list_label <- str_c("pct", specific_one, sep = " ")
fig_class[[list_label]] <- g
g <- ggplot(filter(pr2_match_summary_filtered_one, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(str_trunc(division,
3, ellipsis = ""), "-", class), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(str_trunc(division,
3, ellipsis = ""), "-", class), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
ampli_size_sd)) + theme_bw() + coord_flip() + theme(axis.text.y = element_text(angle = 0, hjust = 0,
vjust = 0)) + xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set -', one_primer_set, '
# - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively') ) +
geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed",
ncol = ncol)
print(g)
list_label <- str_c("size", specific_one, sep = " ")
fig_class[[list_label]] <- g
}
10 Specific analyses
10.1 Specific et sets for Opisthokonta
- 35 UnNonMet
- 16 Piredda
- 17 Comeau
for (one_primer_set in c(16, 17, 35)) {
pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) & (supergroup %in%
c("Opisthokonta")) & (primer_set_id == one_primer_set))
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = ampli_pct), fill = "grey", position = "dodge") + theme(axis.text.y = element_text(angle = 0,
hjust = 0, vjust = 0)) + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Class") +
ggtitle(str_c("Set -", one_primer_set, " - % amplified per Class"))
print(g)
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(division,
"-", class, " - n= ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip() + xlab("Class") +
ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = 2)
print(g)
}
11 Figures
11.1 Fig. 2, S3 and S4 - V4 and V9 - Amplification % and size
11.1.1 Eukaryota - Fig. 2
fig_1 <- (fig1[["Eukaryota V4 pct"]] + ggtitle("") + ylab("") + fig1[["Eukaryota V4 mismatches"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()) +
fig1[["Eukaryota V4 size"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()))/(fig1[["Eukaryota V9 pct"]] + ggtitle("") +
theme(legend.position = "none") + fig1[["Eukaryota V9 mismatches"]] + ggtitle("") + theme(legend.position = "none") + xlab("") + theme(axis.text.y = element_blank()) +
fig1[["Eukaryota V9 size"]] + ggtitle("") + xlab("") + theme(axis.text.y = element_blank())) + plot_layout(heights = c(20, 4))
fig_1
ggsave(plot = fig_1, filename = "figs/fig_pct_sizes_mismatches_V4_V9_euk.pdf", width = 17, height = 12, scale = 2, units = "cm", useDingbats = FALSE)
11.1.2 Bacteria - Fig. S3
fig_1_bact <- (fig1[["Bacteria V4 pct"]] + ggtitle("") + ylab("") + fig1[["Bacteria V4 mismatches"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()) +
fig1[["Bacteria V4 size"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()))
fig_1_bact
ggsave(plot = fig_1_bact, filename = "figs/fig_pct_sizes_mismatches_V4_bact.pdf", width = 17, height = 10, scale = 2, units = "cm", useDingbats = FALSE)
11.1.3 Archaea - Fig. S4
fig_1_arch <- (fig1[["Archaea V4 pct"]] + ggtitle("") + ylab("") + fig1[["Archaea V4 mismatches"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()) +
fig1[["Archaea V4 size"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()))
fig_1_arch
ggsave(plot = fig_1_arch, filename = "figs/fig_pct_sizes_mismatches_V4_arch.pdf", width = 17, height = 10, scale = 2, units = "cm", useDingbats = FALSE)
11.2 Fig. S1 - All - Amplification % and size and mismatches
legend_pct <- cowplot::get_legend( fig1[["Eukaryota general pct"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(0, 0, 0, 20))
)
legend_mismatches <- cowplot::get_legend( fig1[["Eukaryota general mismatches"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(0, 0, 0, 20))
)
fig_1 <-( fig1[["Eukaryota general pct"]] +ggtitle("")+ ylab("") + xlab("") +
fig1[["Eukaryota general mismatches"]] +ggtitle("") +
xlab("") + ylab("") +
theme(axis.text.y = element_blank()) +
fig1[["Eukaryota general size"]] +ggtitle("") + xlab("") + ylab("")+
theme(axis.text.y = element_blank())
)/(
fig1[["Eukaryota specific pct"]] +ggtitle("")+ theme(legend.position="none")+ xlab("")+
fig1[["Eukaryota specific mismatches"]] +ggtitle("")+ theme(legend.position="none") + xlab("")+
theme(axis.text.y = element_blank())+
fig1[["Eukaryota specific size"]] +ggtitle("") + xlab("")+
theme(axis.text.y = element_blank())
) +plot_layout(heights = c(45, 25))
fig_1
ggsave(plot= fig_1 , filename="figs/fig_pct_sizes_mismatches_all.pdf",
width = 19 , height = 17, scale=2, units="cm", useDingbats=FALSE)
11.3 Fig. 3 - Example of V4 and V9
legend_mismatches <- cowplot::get_legend( fig3[["8 mismatches"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(t = 20, r = 0, b = 20, l = 20))
)
legend_size_distri <- cowplot::get_legend( fig3[["4 size_distri"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(t = 20, r = 0, b = 20, l = 20))
)
fig_subV4 <- cowplot::plot_grid(fig3[["8 mismatches positions fwd"]] + theme(legend.position="none") + ggtitle("fwd")+ xlab(""),
fig3[["8 mismatches positions rev"]] + theme(legend.position="none") + ggtitle("rev"),
nrow = 2,
align = "v" )
fig_subV9 <- cowplot::plot_grid(fig3[["27 mismatches positions fwd"]] + theme(legend.position="none") + ggtitle("fwd")+ xlab(""),
fig3[["27 mismatches positions rev"]] + theme(legend.position="none") + ggtitle("rev"),
nrow = 2,
align = "v" )
fig <- cowplot::plot_grid( legend_mismatches,
legend_size_distri,
fig3[["8 mismatches"]] + theme(legend.position="none"),
fig_subV4,
fig3[["8 size_distri"]] + theme(legend.position="none"),
fig3[["8 size"]],
fig3[["27 mismatches"]]+ theme(legend.position="none"),
fig_subV9,
fig3[["27 size_distri"]]+ theme(legend.position="none"),
fig3[["27 size"]],
labels = c("","","A - Primer V4 #08","","","", "B - Primer V9 #27","","",""),
label_y = 1.05, hjust = -0.1, scale=0.95,
ncol = 2, nrow = 5, rel_heights = c(4,20,20,20,20),
align = "h" )
fig
ggsave(plot= fig , filename="figs/fig_examples_V4_V9.pdf",
width = 20 , height = 25, scale=2.5, units="cm", useDingbats=FALSE)
11.4 Fig. S2 and S5 - Supergroup analysis
row_height = 5
for (specific_one in c("general", "specific")) {
fig_3 <- cowplot::plot_grid(fig_supergroup[[str_c("mismatches", specific_one, sep = " ")]], NULL, fig_supergroup[[str_c("size", specific_one, sep = " ")]],
NULL, labels = c("A", "", "B", ""), ncol = 2, nrow = 2, align = "v", rel_widths = c(13, 0.2))
print(fig_3)
height <- row_height * (ceiling(n_primers[[specific_one]]/ncol))
ggsave(plot = fig_3, filename = str_c("figs/fig_supergroup_", specific_one, ".pdf"), width = 20, height = height, scale = 1.75, units = "cm", useDingbats = FALSE)
# ggsave(plot= fig_supergroup[[str_c('mismatches', specific_one, sep = ' ')]] , filename=str_c('figs/fig_supergroup_', specific_one, '_A.pdf'),
# width = 14 , height = height/2, scale=1.75, units='cm', useDingbats=FALSE) ggsave(plot= fig_supergroup[[str_c('size', specific_one, sep = ' ')]]
# , filename=str_c('figs/fig_supergroup_', specific_one, '_B.pdf'), width = 14 , height = height/2, scale=1.75, units='cm', useDingbats=FALSE)
}
11.5 Fig. S6 - Class analysis
row_height = 3.5
for (specific_one in c("general", "specific")) {
fig_4 <- cowplot::plot_grid(fig_class[[str_c("pct", specific_one, sep = " ")]], ncol = 1, nrow = 1, align = "v")
print(fig_4)
height <- row_height * (trunc(n_primers[[specific_one]]/ncol))
ggsave(plot = fig_4, filename = str_c("figs/fig_class_", specific_one, ".pdf"), width = 10, height = height, scale = 3, units = "cm", useDingbats = FALSE)
}
11.6 Fig 1 - Primer sets position
reorder_primer <- function(x, y) {
x + 0.01 * (y - x)
}
fig <- ggplot(filter(primer_sets, !is.na(fwd_start))) + geom_segment(aes(x = fwd_start, xend = rev_end, y = forcats::fct_reorder2(primer_set_name,
fwd_start, rev_end, reorder_primer), yend = forcats::fct_reorder2(primer_set_name, fwd_start, rev_end, reorder_primer), color = specific), size = 3) +
geom_text(aes(x = rev_end + 50, y = forcats::fct_reorder2(primer_set_name, fwd_start, rev_end, reorder_primer), label = str_c(primer_set_id, gene_region,
str_replace_all(primer_set_name, c(`_` = " ", V4 = "", V9 = "")), str_replace_na(specificity, ""), sep = " ")), size = 3, hjust = 0) + scale_color_manual(name = "",
values = c(general = "gray30", specific = "grey70")) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal",
panel.grid.major.y = element_blank(), axis.text.y = element_blank(), axis.line.y = element_blank(), axis.ticks.y = element_blank(), panel.border = element_blank()) +
labs(x = "Position on 18S rRNA", y = "", legend = "") + xlim(0, 2200) + coord_cartesian(clip = "off")
fig
ggsave(plot = fig, filename = str_c("figs/fig_primer_sets_position.pdf"), width = 10, height = 15, scale = 2, units = "cm", useDingbats = FALSE)