Testing primers against PR2

1 Goal

  • Display primer set matches against the PR2 database (latest version 4.12.0)

  • Matches are computed by an independent R script (PR2 Primers pr2_match.R) and stored in an rda file which is read by this Rmd file.

  • In this version # of mismatches is computed as well as position of mismatches

1.1 To do

  • Check why there are some negative amplicon (probably some matches ahead - Put a condition that amplicon should be >)

2 Initialize

Load the variables common to the different scripts and the necessary libraries

# Libraries for bioinfo ----------------------------------------------------

  library("Biostrings")

# Libraries tidyr ---------------------------------------------------------

  library("ggplot2")
  theme_set(theme_light())
  library("dplyr")   
  library("readxl")
  library(openxlsx)
  library("tibble")
  library("readr")
  library("purrr")
  library("forcats")
  library("lubridate")
  library("stringr")
  library("rio")
  
  library(patchwork)

# Libraries dvutils and pr2database -------------------------------------------------------
  if(any(grepl("package:dvutils", search()))) detach("package:dvutils", unload=TRUE)
  library("dvutils")

  # if(any(grepl("package:pr2database", search()))) detach("package:pr2database", unload=TRUE)
  # library("pr2database")

# Options for knitting the report -------------  
  
  library(knitr)
  library(rmdformats)
  library(kableExtra) 

  knitr::opts_chunk$set(fig.width=8, 
                        fig.height=6, 
                        eval=TRUE, 
                        cache=TRUE,
# For next line see https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script                       
                        cache.lazy = FALSE,
                        echo=TRUE,
                        prompt=FALSE,
                        tidy=TRUE,
                        comment=NA,
                        message=FALSE,
                        warning=FALSE)
  opts_knit$set(width=90)
  options(max.print="500")  
  options(knitr.kable.NA = '')

3 Constants

file_param <- "../R_scripts/param_pr2_primers.R"
source(file_param)
cat(readChar(file_param, 1e+05))
max_mismatch = 2
gene_selected = "18S_rRNA"  # Can be 18S_rRNA or 16_rRNA
rda_file_label = "_all"  # This label is added at the end of the rda file name
gene_regions = c("V4", "V9")
kingdoms = c("Eukaryota", "Bacteria", "Archaea")

# These primers should not be included (semi nested PCR or amplify very few sequences)
primer_sets_excluded = c(73, 74, 37, 67, 91, 68) 

sequence_length_min = 1350
sequence_length_min_V9 = 1650 
sequence_length_min_V4 = 1200 

4 Read primer file

Only keep the 18S primers with V region

primers <- import("../data/primers.rds")
primer_sets <- import("../data/primer_sets.rds")


if (gene_selected == "18S_rRNA") {
    
    primers <- primers %>% filter(gene == "18S rRNA", !is.na(doi))
    
    
    primer_sets <- primer_sets %>% filter(gene == "18S rRNA", !is.na(doi), !str_detect(gene_region, "ITS|cloning|full"))
    
    
    gene_regions_all = unique(primer_sets$gene_region)
    print(gene_regions_all)
    
} else if (gene_selected == "16S_rRNA") {
    primer_sets <- primer_sets %>% filter(specificity == "plastid" | (gene == "18S rRNA" & specificity == "universal"))
}
 [1] "37F-41F"          "Helix 37 and 37f" "V1-V2"            "V2"               "V2-V3"            "V3"               "V3-V4"           
 [8] "V4"               "V4-V5"            "V5"               "V6"               "V6-V8"            "V7"               "V7-V8"           
[15] "V7-V9"            "V8-V9"            "V9"              
primer_sets <- primer_sets %>% filter(!(primer_set_id %in% primer_sets_excluded)) %>% mutate(specific = ifelse(is.na(specificity), "general", "specific")) %>% 
    relocate(specific, .before = specificity)


# file_name = str_c('output/Table_primers_',gene_selected, '.xlsx') export(primer_sets, file = file_name, firstActiveRow = 2)

n_primers <- list()
n_primers[["general"]] <- nrow(filter(primer_sets, specific == "general"))
n_primers[["specific"]] <- nrow(filter(primer_sets, specific == "specific"))
n_primers
$general
[1] 44

$specific
[1] 25
knitr::kable(select(primer_sets, primer_set_name, primer_set_id, fwd_name, rev_name, gene_region, amplicon_size)) %>% kableExtra::kable_styling()
primer_set_name primer_set_id fwd_name rev_name gene_region amplicon_size
Li_2020 109 s14F3 17 37F-41F
Clerissi_2018 81 18SV1V2F 18SV1V2R V1-V2 341
Bik_2012 80 F04 R22 V1-V2 402
Rachik_2018 110 18S-82F Euk-516r V2-V3 481
Lentendu_2014c 72 Kineto_80 Kineto_651 V2-V3 667
Santoferrara_2018 59 152+ 528- V2-V3 352
Lentendu_2014b 69 Chryso_240 Chryso_651 V2-V3 545
Sisson_2018 84 SAR_V3_F SAR_V3_R V3 182
Lentendu_2014a 62 Cer2F Cer1R V3-V4 588
Michaud_2019a 87 Oxy_18S-F Oxy_18S-R V3-V4 444
Kataoka_2017 108 545F 1119R V4 561
Zhan 40 Uni18SF Uni18SR V4 461
Brate1 13 3NDf V4_euk_R1 V4 465
Brate2 14 3NDf V4_euk_R2 V4 458
Geisen 12 3NDf 1132rmod V4 599
Lambert 34 515FY NSR951 V4 391
Needham 33 515F Univ 926R V4 589
Parfrey 18 515F 1119r V4 598
Hugerth_2 4 563f 1132r V4 587
Piwosz_2019 102 TAReuk454FWD1 HaptoR1 V4 279
Piredda_V4 16 TAReuk454FWD1 V4 18S Next.Rev V4 417
Bass 7 V4_1f TAReukREV3 V4 417
Stoeck_V4_2 8 TAReuk454FWD1 TAReukREV3 V4 417
Stoeck_V4_1 36 TAReuk454FWD1 V4RB V4 417
Choi_2020 104 TAReuk454FWD1_Choi TAReukREV3_Choi V4 417
Belevich_2017 86 EuF-V4 picoR2 V4 425
Bradley_2016_V4 90 TAReuk454FWD1 V4r V4 417
Vannini 19 Claudia Vannini (F) Claudia Vannini (R) V4 439
Hadziavdic_1 1 F-566 R-1200 V4 635
Moreno 15 EUKAF EUKAR V4 410
Emberg_2018 103 E572F 897r V4 344
Kim_V4_2016 22 528F Nex_18S_0964_R V4 419
Comeau 17 E572F E1009R V4 438
Mangot 25 NSF563 NSR951 V4 380
Xu_2020 99 A-528F B-706R V4 344
Hadziavdic_2 2 A-528F R-952 V4 379
Egge 39 A-528F PRYM01+7 V4 396
Fadev_2018 98 A-528F V4RB V4 408
Hugerth_6 83 A-528F 1132r V4 577
Kilias_2013 100 A-528F 1055R V4 709
Bass_2020 119 574*f UNonMet DB V4 566
Hugerth_1 3 574*f 1132r V4 576
Hugerth_5 77 574f 1132r V4 576
Venter 23 590F 1300R V4 720
Zimmerman 21 D512for D978rev V4 437
Harder 41 Cerc479F Cerc750R V4 283
Simon 24 EK-565F-NGS EUK1134-R V4 527
Fiore-Donno_2018c 65 S616F_Cerco S947R_Cerco V4 348
Fiore-Donno_2018d 66 S616F_Eocer S947R_Cerco V4 348
Fiore-Donno_2018a 63 S616F_Cerco S963R_Cerco V4 367
Fiore-Donno_2018b 64 S616F_Eocer S963R_Cerco V4 367
Hugerth_3 5 616f 1132r V4 534
Hugerth_4 6 616*f 1132r V4 534
UNonMet 35 EUK581-F EUK1134-R V4
Stokes_2002 96 QPX-F QPX-R2 V4
Hu_2016 101 574*f 1132R modified V4-V5 576
Michaud_2019b 88 Par_18S-F Par_18S-R V5 562
Moro 38 ChloroF ChloroR V5 494
Stokes_2002 97 LABY-A LABY-Y V6 434
Wilkins 32 926wF 1392-R V6-V8 513
Huo_2020 107 960F NSR1438 V7 264
Lundgreen_2019 76 F-1183 R-1443 V7 261
Chemidlin_2011 92 FF390 FR1 V7-V8 349
Kim_V9_2016 106 Nex_18S_1434_F Nex_18S_1757_R V8-V9 371
Bradley_2016_V9 89 V8f 1510R V8-V9 372
Amaral_1 28 1380F 1510R V9 176
Amaral_2 29 1389F 1510R V9 167
Piredda_V9 31 1388F 1510R V9 167
Stoeck_V9 27 1391F EukB V9 169

5 Computing the matches

This part is done by an R script script_primers_pr2_match.R that is executed in batch mode on Roscoff server.

6 Read the file for all primer sets and filter

This file is created by the R script script_update files.R

# For testing

pr2_match_final <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, ".rds"))

print(str_c("Before filtration: ", nrow(pr2_match_final)))
[1] "Before filtration: 11053938"
# Replace the gene_region and primer_set_label_short since they can be changed in the database
primer_sets_labels <- primer_sets %>% mutate(primer_set_label_short = str_c(str_sub(gene_region, 1, 2), sprintf("%03d", primer_set_id), str_sub(str_replace_na(specificity, 
    replacement = ""), 1, 3), sep = " "), primer_set_label_long = str_c(gene_region, primer_set_name, "-", str_replace_na(specificity, "general"), 
    sep = " ")) %>% # Remove the last underscore if left by itself
mutate(primer_set_label_short = str_replace(primer_set_label_short, " $", "")) %>% select(primer_set_id, primer_set_label_short, primer_set_label_long, 
    gene_region, specific, specificity)

pr2_match_final <- pr2_match_final %>% left_join(primer_sets_labels) %>% # Remove sequences for which the introns have been removed
filter(!str_detect(pr2_accession, "_UC")) %>% # Remove sequence that are shorter
filter((str_detect(gene_region, "V4") & sequence_length >= sequence_length_min_V4) | (str_detect(gene_region, "V9") & sequence_length >= sequence_length_min_V9) | 
    (!str_detect(gene_region, "V4|V9") & sequence_length >= sequence_length_min)) %>% mutate(mismatch_number = fwd_mismatch_number + rev_mismatch_number) %>% 
    # Only keep the selected primers
filter(primer_set_id %in% primer_sets$primer_set_id)

print(str_c("After filtration: ", nrow(pr2_match_final)))
[1] "After filtration: 6807694"

7 Read the summarized tables

pr2_match_summary_primer_set <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary.rds"))
pr2_match_summary_primer_set_sg <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary_sg.rds"))
pr2_match_summary_primer_set_class <- readRDS(file = str_c("output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, "_summary_class.rds"))


# Long form for Number of sequences

# This dataframe is used to re-order the bars correctly
pct_category_order <- data.frame(pct_category = c("ampli_pct", "fwd_pct", "rev_pct"), pct_category_order = c(1, 3, 2))

pr2_match_summary_primer_set_long <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "pct_category", values_to = "pct_seq", cols = fwd_pct:ampli_pct) %>% 
    left_join(pct_category_order)

8 Tables

8.1 Path for tables

table_path = "C:/daniel.vaulot@gmail.com/Papers/2020 Vaulot primers/paper-primers-overleaf-1.0/tables/"

8.2 Primers

table_primers <- primers %>% mutate(specific = ifelse(is.na(specificity), "general", "specific")) %>% filter(!str_detect(specificity, "blocking") | 
    is.na(specificity)) %>% relocate(specific, .before = specificity) %>% group_by(direction, specific) %>% count() %>% arrange(-n) %>% tidyr::pivot_wider(names_from = specific, 
    values_from = n)

8.3 Primers sets

table_primer_sets <- primer_sets %>% group_by(gene_region, specific) %>% count() %>% arrange(gene_region) %>% tidyr::pivot_wider(names_from = specific, 
    values_from = n) %>% relocate(general, .before = specific)

8.4 Primer sets statistics

table_primer_sets_ampli <- pr2_match_summary_primer_set %>% filter(kingdom == "Eukaryota") %>% group_by(specific) %>% summarise(across(.cols = all_of(c("fwd_pct", 
    "rev_pct", "ampli_pct", "ampli_size_mean")), .fns = list(min = min, mean = mean, max = max))) %>% tidyr::pivot_longer(cols = matches("min|max|mean"), 
    names_to = "parameter", values_to = "values") %>% tidyr::pivot_wider(names_from = specific, values_from = values)

8.5 Save to Excel file

onglets = list(table_primers = table_primers, table_primer_sets = table_primer_sets, table_primer_sets_ampli = table_primer_sets_ampli)
file_name <- str_c(table_path, "tables_R.xlsx")
openxlsx::write.xlsx(onglets, file_name, zoom = 90, firstRow = TRUE, firstCol = TRUE)

8.6 Export to Latex tables

library(xtable)

8.6.1 Define function to format the tables

  • Not used
    • “X” = “\\cellcolor{gray}”
sanitize.italics <- function(str) {
  str_replace_all(str, c("_" = " ", 
                         # "ital\\{" = "\\\\textit{", 
                         "°" = "\\\\degree",
                         "±"="$\\pm$"))
}


# See: https://www.rdocumentation.org/packages/xtable/versions/1.8-4/topics/print.xtable

latex_table <- function(table, caption, label, file, align, 
                        scalebox = getOption("xtable.scalebox", NULL), 
                        digits = 0, 
                        add.to.row = getOption("xtable.add.to.row", NULL),
                        include.colnames = TRUE, 
                        tabular.environment = getOption("xtable.tabular.environment", "tabular") ) {
  
    table <- xtable::xtable(table,
                            label=label,
                            caption=caption,
                            align = align,
                            digits=digits)
    print(table,  scalebox = scalebox,
          caption.placement = "top",
          include.rownames = FALSE,
          add.to.row = add.to.row,
          include.colnames = include.colnames, 
          tabular.environment = tabular.environment,
          file=file,
          sanitize.text.function = sanitize.italics)
    }

8.6.2 File path

full_file_name <- function(file_name) {
    str_c(table_path, file_name)
}

8.6.3 Table - Primers

latex_table(table = table_primers, caption = "Type of primers listed in the pr2-primers database. General primers target all eukaryotes and specific primers only certain taxonomic groups.", 
    label = "tab:primers", file = full_file_name("table_primers.tex"), align = c("l", "c", "c", "c"), scalebox = 1, digits = 0)

8.6.4 Table - Primer sets

latex_table(table = table_primer_sets, caption = "Regions of the 18S rRNA gene targeted by the primer sets from the pr2-primers database.", label = "tab:primer_sets", 
    file = full_file_name("table_primer_sets.tex"), align = c("l", "l", "c", "c"), scalebox = 1, digits = 0)

8.6.5 Table - Primer sets stats

latex_table(table = table_primer_sets_ampli, caption = "Overall characteristics of primer sets listed in the pr2-primers database.", label = "tab:primer_sets_ampli", 
    file = full_file_name("table_primer_sets_ampli.tex"), align = c("l", "l", "c", "c"), scalebox = 1, digits = 1)

8.7 Supplementary Table - Primers

table <- primers %>% 
  select(primer_id, name, sequence, direction, start_yeast, specificity, doi)
 
add.to.row <- list()
add.to.row$pos <- list(0, 0, 0, 0, 0)
add.to.row$command <- c( 
  # Header for first page
  "id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
  "\\endfirsthead \n \\hline \n",
  
  # Header for other pages
  "id & Name & Sequence & Direction & Start (yeast) & Specificity & DOI \\\\\n",
  "\\hline \n \\endhead \n",
  
  # Footers
  "\\hline \n \\endfoot \n  \\endlastfoot \n"
)


latex_table(table = table,
            caption = "List of primers in the pr2-primers database ordered by start position relative to the sequence of the yeast \textit{Saccharomyces cerevisiae} (FU970071).",
            label = "tabsup:primers",
            file = full_file_name("table_sup_primers.tex"),
            align = c("l","l","l", "l","l","l","l", "l" ),
            digits= 0,
            add.to.row = add.to.row,
            include.colnames = FALSE, 
            tabular.environment = "longtable"
)

8.8 Supplementary Table - Primer sets

table <- primer_sets %>% 
  select(primer_set_id, primer_set_name, fwd_name, rev_name, gene_region, specificity, doi) %>% 
  arrange(primer_set_id)
 
add.to.row <- list()
add.to.row$pos <- list(0, 0, 0, 0, 0)
add.to.row$command <- c( 
  # Header for first page
  "id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
  "\\endfirsthead \n \\hline \n",
  
  # Header for other pages
  "id & Name & Primer fwd & Primer rev & Region & Specificity & DOI \\\\\n",
  "\\hline \n \\endhead \n",
  
  # Footers
  "\\hline \n \\endfoot \n  \\endlastfoot \n"
)


latex_table(table = table,
            caption = "List of primer sets in the pr2-primers database.",
            label = "tabsup:primer_sets",
            file = full_file_name("table_sup_primer_sets.tex"),
            align = c("l","l","l", "l","l","l","l", "l" ),
            digits= 0,
            add.to.row = add.to.row,
            include.colnames = FALSE, 
            tabular.environment = "longtable"
)

9 Graphics

9.1 Common parameters

ncol = 8

fig3 <- list()

fig1 <- list()

9.2 Amplicon length

9.2.1 Scatter

ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id, 
    color = as.factor(mismatch_number))) + geom_point(size = 3) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + labs(x = "Primer set") + scale_color_viridis_d() + facet_wrap(vars(specific), nrow = 2, 
    scales = "free_x")

9.2.2 Average size

ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_short, y = ampli_size, group = primer_set_id, 
    fill = as.factor(mismatch_number))) + geom_boxplot(outlier.alpha = 0.3) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + labs(x = "Primer set") + scale_fill_viridis_d() + facet_wrap(vars(specific), nrow = 2, 
    scales = "free_x")

9.3 Example with sets V4 and V9

9.3.1 Plot an example of amplicon distribution (Fig. 3)

for (one_primer_set in c(8, 27)) {
    
    if (one_primer_set == 8) {
        xmin = 400
        xmax = 450
        xmax2 = 2000
    } else {
        xmin = 140
        xmax = 190
        xmax2 = 1000
        
    }
    
    pr2_match_final_one <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom == 
        "Eukaryota") %>% filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X", "Protoalveoalata")))
    
    primer_set_label_long = pr2_match_final_one$primer_set_label_long[1]
    
    
    g <- ggplot(pr2_match_final_one, aes(x = ampli_size)) + geom_density(fill = "blue", alpha = 0.9) + xlab("Amplicon size") + 
        ggtitle(str_c("Primer set - ", primer_set_label_long)) + xlim(xmin, xmax)
    
    print(g)
    
    g <- ggplot(pr2_match_final_one, aes(x = ampli_size, fill = supergroup)) + geom_density(alpha = 0.9) + 
        theme_bw() + # theme(legend.position = 'none') +
    guides(fill = guide_legend(nrow = 3)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") + 
        labs(x = "Amplicon size (bp)", y = "Density", fill = "Supergroup") + # ggtitle(str_c('Primer set - ', primer_set_label_long)) +
    xlim(xmin, xmax)
    
    print(g)
    fig3[[str_c(one_primer_set, " size_distri")]] <- g
    
    g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_boxplot(outlier.alpha = 0.3) + 
        theme_bw() + coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("Supergroup") + ylab("Amplicon size (bp)") + ylim(0, xmax2) + geom_hline(yintercept = c(450, 550), 
        linetype = c(2, 3))
    
    print(g)
    fig3[[str_c(one_primer_set, " size")]] <- g
    
    g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_violin() + coord_flip() + 
        # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("Supergroup") + ylab("Amplicon size (bp)")
    print(g)
    
}

9.3.2 Plot an example of percent amplification

for (one_primer_set in c(8, 27)) {
    
    pr2_match_summary_filtered <- pr2_match_summary_primer_set_sg %>% filter((n_seq > 20) & (primer_set_id == one_primer_set) & (kingdom == "Eukaryota") & 
        !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
    
    
    
    g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(supergroup, " - n = ", n_seq), y = ampli_pct, 
        fill = supergroup), position = "dodge") + theme_bw() + # theme(legend.position = 'none') +
    guides(fill = guide_legend(nrow = 2)) + theme(legend.position = "top", legend.box = "horizontal") + scale_fill_viridis_d(option = "inferno") + 
        coord_flip() + labs(x = "% of sequences amplified", y = "", legend = "Supergroup") + scale_y_continuous(minor_breaks = seq(0, 100, by = 10), 
        breaks = seq(0, 100, by = 20), limits = c(0, 100))
    
    # ggtitle (str_c('Set - ', one_primer_set, ' - % amplified per Supergroup') )
    
    print(g)
    fig3[[str_c(one_primer_set, " pct")]] <- g
    
    g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(supergroup, " - n = ", n_seq), y = ampli_size_mean), 
        colour = "black") + coord_flip() + geom_errorbar(aes(x = str_c(supergroup, " - n = ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - 
        ampli_size_sd)) + xlab("Supergroup") + ylab("Amplicon size (bp)")  # +
    # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set - ', one_primer_set, ' - Amplicon sizes') ) +
    # geom_hline(yintercept = c(450,550) , linetype= 2)
    
    print(g)
    
    
    
    
}

9.3.3 Plot an example of mismatches

for (one_primer_set in c(8, 27)) {
    
    pr2_mismatches <- pr2_match_summary_primer_set_sg %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", cols = contains("ampli_mismatch"), 
        names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>% 
        mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20) & (primer_set_id == one_primer_set) & (kingdom == 
        "Eukaryota") & !(supergroup %in% c("Apusozoa", "Eukaryota_X")))
    
    g <- ggplot(pr2_mismatches, aes(x = str_c(supergroup, " - n = ", n_seq), y = mismatch_pct, group = primer_set_id, fill = as.factor(mismatch_number))) + 
        geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Supergroup", y = "% of sequences with mismatches", 
        fill = "Mismatches") + scale_fill_viridis_d(direction = -1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", 
        legend.box = "horizontal") + coord_flip()
    
    print(g)
    
    fig3[[str_c(one_primer_set, "mismatches", sep = " ")]] <- g
    
    
    print(g)
    
    
}

9.3.4 Plot an example of mismatches positions

for (one_primer_set in c(8, 27)) {
    
    df <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(kingdom == "Eukaryota") %>% filter(!(supergroup %in% c("Apusozoa", 
        "Eukaryota_X")))
    
    g <- ggplot() + geom_histogram(data = filter(df, !is.na(fwd_mismatch_primer_position_5prime)), aes(x = fwd_mismatch_primer_position_5prime, fill = supergroup), 
        binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() + scale_x_continuous(minor_breaks = 0:30) + 
        labs(y = "Density", x = "Position of mismatches from 5' end", title = "fwd")
    
    print(g)
    fig3[[str_c(one_primer_set, "mismatches positions fwd", sep = " ")]] <- g
    
    g <- ggplot() + geom_histogram(data = filter(df, !is.na(rev_mismatch_primer_position_5prime)), aes(x = rev_mismatch_primer_position_5prime, fill = supergroup), 
        binwidth = 1, stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() + scale_x_continuous(minor_breaks = 0:30) + 
        labs(y = "Density", x = "Position of mismatches from 5' end", title = "rev")
    
    print(g)
    fig3[[str_c(one_primer_set, "mismatches positions rev", sep = " ")]] <- g
    
}

9.4 All Eukaryotes

Comments

  • Percent of sequences amplified
    • Logically, the % of seq amplified is always < min(% of sequences matching forwar, % of sequences matching reverse)
    • In general it is the reverse primer that causes problems.
    • Some primer sets do not amplify any sequence (11, 19, 20, 21)
    • For V9, primer set 30 reverse primer is in the ITS region which is not present in the PR2 sequences, so no amplification.
  • Amplicon sizes
    • Only 8 V4 primer sets are suitable for Illumina 2x250 (max amplicon size = 450 bp)
    • This is if we consider the mean… If we consider the variation around the mean then, only 3 suitable for 2x250
    • Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)

9.4.1 Plot only V4 and V9 - For Fig. 2

for (one_kingdom in kingdoms) {
    
    for (one_region in gene_regions) {
        
        pr2_match_summary_primer_set_region_long <- pr2_match_summary_primer_set_long %>% filter(kingdom == one_kingdom) %>% filter(gene_region == 
            one_region) %>% # Remove the group specific primers
        filter(specific == "general") %>% filter(pct_category %in% c("ampli_pct", "fwd_pct", "rev_pct"))
        
        pr2_match_summary_primer_set_region <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% filter(gene_region == one_region) %>% 
            # Remove the group specific primers)
        filter(specific == "general")
        
        pr2_match_region <- pr2_match_final %>% filter(kingdom == one_kingdom) %>% filter(gene_region == one_region) %>% # Remove the group specific primers
        filter(specific == "general")
        
        # % Ampli
        
        g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = str_replace(str_c(sprintf("%03d", primer_set_id), primer_set_label_long, 
            sep = " - "), " - general", ""), y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), width = 0.7, position = "dodge") + 
            theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") + scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue", 
            rev_pct = "red"), labels = c("Amplicons", "rev", "fwd")) + ggtitle(str_c(one_kingdom, one_region, "Percentage of sequences recovered", 
            sep = " - ")) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + 
            ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
        
        print(g)
        
        # Size
        
        fig1[[str_c(one_kingdom, one_region, "pct", sep = " ")]] <- g
        
        
        g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(sprintf("%03d", primer_set_id), 
            primer_set_label_long, sep = " - "), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(sprintf("%03d", primer_set_id), 
            primer_set_label_long, sep = " - "), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") + 
            ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
        ggtitle(str_c(one_kingdom, one_region, "Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0, 
            850) + theme(axis.text.x = element_text(angle = 0, hjust = 1)) + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip()
        
        print(g)
        
        fig1[[str_c(one_kingdom, one_region, "size", sep = " ")]] <- g
        
        # Mismatches
        
        pr2_mismatches <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", 
            cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 
            1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(gene_region == one_region) %>% # Remove the group specific primers
        filter(specific == "general")
        
        g <- ggplot(pr2_mismatches, aes(x = str_c(sprintf("%03d", primer_set_id), primer_set_label_long, sep = " - "), y = mismatch_pct, group = primer_set_id, 
            fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Primer set", 
            y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom, one_region, sep = " - ")) + scale_fill_viridis_d(direction = -1, 
            alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
        
        print(g)
        
        fig1[[str_c(one_kingdom, one_region, "mismatches", sep = " ")]] <- g
        
        
    }
}

9.4.2 Plot all with panel general and specific - For Fig S1

for (one_kingdom in kingdoms) {
    
    for (specific_one in c("general", "specific")) {
        
        df <- pr2_match_summary_primer_set_long %>% filter(kingdom == one_kingdom) %>% filter(pct_category %in% c("ampli_pct", "fwd_pct", "rev_pct")) %>% 
            filter(specific == specific_one)
        
        g <- ggplot(df) + geom_col(aes(x = str_replace(str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), " - general", ""), 
            y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") + 
            ylab("% of sequences amplified") + scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "blue", rev_pct = "red"), labels = c("Amplicons", 
            "rev", "fwd")) + ggtitle(str_c(one_kingdom, specific_one, "Percentage of sequences recovered", sep = " - ")) + theme(axis.text.y = element_text(angle = 0, 
            hjust = 0, vjust = 0)) + ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
        
        
        print(g)
        
        fig1[[str_c(one_kingdom, specific_one, "pct", sep = " ")]] <- g
        
        df <- pr2_match_summary_primer_set %>% filter(kingdom == one_kingdom) %>% filter(!is.nan(ampli_size_mean)) %>% filter(specific == specific_one)
        
        g <- ggplot(df) + geom_point(aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), y = ampli_size_mean), colour = "black") + 
            geom_errorbar(aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), ymax = ampli_size_mean + ampli_size_sd, 
                ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
        ggtitle(str_c(one_kingdom, specific_one, " - Amplicon size", sep = " - ")) + geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0, 
            900) + theme(axis.text.y = element_text(angle = 0, hjust = 0)) + coord_flip()
        
        print(g)
        
        fig1[[str_c(one_kingdom, specific_one, "size", sep = " ")]] <- g
        
    }
}

9.5 Mismatches - for Fig.S1

9.5.1 Number of mismatches

for (one_kingdom in kingdoms) {
    
    for (specific_one in c("general", "specific")) {
        
        pr2_mismatches <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", cols = contains("ampli_mismatch"), 
            names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 1, 1)) %>% 
            mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter(specific == specific_one) %>% filter(kingdom == one_kingdom)
        
        g <- ggplot(pr2_mismatches, aes(x = str_c(primer_set_label_long, sprintf("%03d", primer_set_id), sep = " - "), y = mismatch_pct, group = primer_set_id, 
            fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + labs(x = "Primer set", 
            y = "% of sequences with mismatches", fill = "Mismatches", title = str_c(one_kingdom, specific_one, sep = " - ")) + scale_fill_viridis_d(direction = -1, 
            alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
        
        print(g)
        
        fig1[[str_c(one_kingdom, specific_one, "mismatches", sep = " ")]] <- g
        
        
    }
}

9.5.2 Position of mismatches (only for Eukaryota)

ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = fwd_mismatch_primer_position_5prime), 
    outlier.alpha = 0.3, color = "blue") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0, vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25, 
    breaks = seq(0, 25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer fwd") + facet_wrap(vars(specific), 
    nrow = 2, scales = "free_y")

ggplot(filter(pr2_match_final, kingdom == "Eukaryota"), aes(x = primer_set_label_long)) + geom_boxplot(aes(y = rev_mismatch_primer_position_5prime), 
    outlier.alpha = 0.3, color = "red") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0, vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25, 
    breaks = seq(0, 25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches", title = "Primer rev") + facet_wrap(vars(specific), 
    nrow = 2, scales = "free_y")

ggplot(filter(pr2_match_final, !is.na(rev_mismatch_primer_position_5prime), kingdom == "Eukaryota")) + geom_histogram(aes(x = rev_mismatch_primer_position_5prime), 
    color = "red", binwidth = 1, stat = "density") + theme_bw() + scale_x_continuous(minor_breaks = 0:30, breaks = seq(0, 30, by = 5), limits = c(0, 
    30)) + labs(y = "Primer set", x = "Position of mismatches", title = "Primer rev") + facet_wrap(vars(primer_set_label_long), ncol = ncol)

## By supergroup (only for Eukaryota) - For Fig. S2 and S5

Comments:

  • Excavata have a very different patterns from the rest of the group. They are not amplified by quite a few primer sets. They have also bigger amplicons
  • Some groups exhibit higher variability in amplicon size (e.g Chlorophyta)

9.5.3 % of Ampli and Amplicon size

fig_supergroup <- list()
ncol = 8

for (specific_one in c("general", "specific")) {
    
    pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>% 
        filter(specific == specific_one) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa", 
        "Eukaryota_X")))
    
    
    
    g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = supergroup, y = ampli_pct, fill = supergroup), 
        position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Supergroup") + 
        ggtitle(str_c(specific_one, " primers")) + scale_fill_viridis_d(option = "inferno") + ylim(0, 100) + 
        facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol) + guides(fill = guide_legend(nrow = 1)) + 
        theme(legend.position = "top", legend.box = "horizontal") + theme(legend.position = "none")
    
    print(g)
    
    list_label <- str_c("pct", specific_one, sep = " ")
    fig_supergroup[[list_label]] <- g
    
    
    
    g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = supergroup, 
        y = ampli_size_mean), colour = "black") + theme_bw() + coord_flip() + geom_errorbar(aes(x = supergroup, 
        ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") + 
        ylab("Amplicon size (bp)") + ggtitle(str_c(specific_one, " primers")) + geom_hline(yintercept = c(450, 
        550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed", ncol = ncol)
    
    print(g)
    
    list_label <- str_c("size", specific_one, sep = " ")
    
    fig_supergroup[[list_label]] <- g
    
}

9.5.4 Number of mismatches (Only Eukaryotes) - Fig. S2 and S5

ncol = 8

pr2_mismatches_sg <- pr2_match_summary_primer_set_sg %>% filter(kingdom == "Eukaryota") %>% tidyr::pivot_longer(names_to = "mismatch_number", values_to = "mismatch_pct", 
    cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>% select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number, 
    1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa", 
    "Eukaryota_X")))

for (specific_one in c("general", "specific")) {
    
    pr2_mismatches_sg_one <- pr2_mismatches_sg %>% filter(specific == specific_one)
    
    g <- ggplot(pr2_mismatches_sg_one, aes(x = supergroup, y = mismatch_pct, fill = fct_rev(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 0, 
        hjust = 1)) + labs(x = "Group", y = "% of sequences with mismatches", title = str_c(specific_one, " primers"), fill = "Mismatches") + scale_fill_viridis_d(direction = 1, 
        alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") + coord_flip() + facet_wrap(vars(primer_set_label_short), 
        ncol = ncol)
    
    print(g)
    
    fig_supergroup[[str_c("mismatches", specific_one, sep = " ")]] <- g
}

9.6 By class for autotrophs (Only EUkaryotes) - Fig. S6

fig_class <- list()
ncol = 8

for (specific_one in c("general", "specific")) {
    
    pr2_match_summary_filtered_one <- pr2_match_summary_primer_set_class %>% filter(n_seq > 20) %>% filter(division %in% 
        c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta")) %>% filter(specific == 
        specific_one)
    
    
    g <- ggplot(pr2_match_summary_filtered_one) + geom_col(aes(x = str_c(str_trunc(division, 20, ellipsis = ""), 
        "-", class), y = ampli_pct, fill = division), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") + 
        xlab("Class") + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + # scale_fill_viridis_d(option = 'plasma') +
    scale_fill_brewer(palette = "Accent") + ylim(0, 100) + facet_wrap(vars(primer_set_label_short), scales = "fixed", 
        ncol = ncol) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal")
    
    print(g)
    
    list_label <- str_c("pct", specific_one, sep = " ")
    
    fig_class[[list_label]] <- g
    
    g <- ggplot(filter(pr2_match_summary_filtered_one, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(str_trunc(division, 
        3, ellipsis = ""), "-", class), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(str_trunc(division, 
        3, ellipsis = ""), "-", class), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - 
        ampli_size_sd)) + theme_bw() + coord_flip() + theme(axis.text.y = element_text(angle = 0, hjust = 0, 
        vjust = 0)) + xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle (str_c('Set -', one_primer_set, '
    # - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively') ) +
    geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_set_label_short, scales = "fixed", 
        ncol = ncol)
    
    print(g)
    
    list_label <- str_c("size", specific_one, sep = " ")
    
    fig_class[[list_label]] <- g
}

10 Specific analyses

10.1 Specific et sets for Opisthokonta

  • 35 UnNonMet
  • 16 Piredda
  • 17 Comeau
for (one_primer_set in c(16, 17, 35)) {
    
    pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) & (supergroup %in% 
        c("Opisthokonta")) & (primer_set_id == one_primer_set))
    
    g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(division, 
        "-", class, " - n= ", n_seq), y = ampli_pct), fill = "grey", position = "dodge") + theme(axis.text.y = element_text(angle = 0, 
        hjust = 0, vjust = 0)) + theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Class") + 
        ggtitle(str_c("Set -", one_primer_set, " - % amplified per Class"))
    
    print(g)
    
    g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division, 
        "-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(division, 
        "-", class, " - n= ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + 
        theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + coord_flip() + xlab("Class") + 
        ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
    ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) + 
        geom_hline(yintercept = c(450, 550), linetype = 2)
    
    print(g)
    
}

11 Figures

11.1 Fig. 2, S3 and S4 - V4 and V9 - Amplification % and size

11.1.1 Eukaryota - Fig. 2

fig_1 <- (fig1[["Eukaryota V4 pct"]] + ggtitle("") + ylab("") + fig1[["Eukaryota V4 mismatches"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()) + 
    fig1[["Eukaryota V4 size"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()))/(fig1[["Eukaryota V9 pct"]] + ggtitle("") + 
    theme(legend.position = "none") + fig1[["Eukaryota V9 mismatches"]] + ggtitle("") + theme(legend.position = "none") + xlab("") + theme(axis.text.y = element_blank()) + 
    fig1[["Eukaryota V9 size"]] + ggtitle("") + xlab("") + theme(axis.text.y = element_blank())) + plot_layout(heights = c(20, 4))
fig_1

ggsave(plot = fig_1, filename = "figs/fig_pct_sizes_mismatches_V4_V9_euk.pdf", width = 17, height = 12, scale = 2, units = "cm", useDingbats = FALSE)

11.1.2 Bacteria - Fig. S3

fig_1_bact <- (fig1[["Bacteria V4 pct"]] + ggtitle("") + ylab("") + fig1[["Bacteria V4 mismatches"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()) + 
    fig1[["Bacteria V4 size"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()))
fig_1_bact

ggsave(plot = fig_1_bact, filename = "figs/fig_pct_sizes_mismatches_V4_bact.pdf", width = 17, height = 10, scale = 2, units = "cm", useDingbats = FALSE)

11.1.3 Archaea - Fig. S4

fig_1_arch <- (fig1[["Archaea V4 pct"]] + ggtitle("") + ylab("") + fig1[["Archaea V4 mismatches"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()) + 
    fig1[["Archaea V4 size"]] + ggtitle("") + xlab("") + ylab("") + theme(axis.text.y = element_blank()))
fig_1_arch

ggsave(plot = fig_1_arch, filename = "figs/fig_pct_sizes_mismatches_V4_arch.pdf", width = 17, height = 10, scale = 2, units = "cm", useDingbats = FALSE)

11.2 Fig. S1 - All - Amplification % and size and mismatches

legend_pct <- cowplot::get_legend( fig1[["Eukaryota general pct"]] + 
                      # create some space to the left of the legend    
                      theme(legend.box.margin = margin(0, 0, 0, 20))
                    )

legend_mismatches <- cowplot::get_legend( fig1[["Eukaryota general mismatches"]] + 
                      # create some space to the left of the legend    
                      theme(legend.box.margin = margin(0, 0, 0, 20))
                    )


fig_1 <-( fig1[["Eukaryota general pct"]]  +ggtitle("")+ ylab("") + xlab("") +
          fig1[["Eukaryota general mismatches"]] +ggtitle("") +
            xlab("") + ylab("") + 
            theme(axis.text.y = element_blank()) + 
          fig1[["Eukaryota general size"]] +ggtitle("") + xlab("") + ylab("")+ 
            theme(axis.text.y = element_blank())
          )/(
          fig1[["Eukaryota specific pct"]]  +ggtitle("")+ theme(legend.position="none")+ xlab("")+  
          fig1[["Eukaryota specific mismatches"]] +ggtitle("")+ theme(legend.position="none") + xlab("")+ 
            theme(axis.text.y = element_blank())+ 
          fig1[["Eukaryota specific size"]] +ggtitle("") + xlab("")+ 
            theme(axis.text.y = element_blank())
          ) +plot_layout(heights = c(45, 25))
fig_1



ggsave(plot= fig_1 , filename="figs/fig_pct_sizes_mismatches_all.pdf",
           width = 19 , height = 17, scale=2, units="cm", useDingbats=FALSE)

11.3 Fig. 3 - Example of V4 and V9

legend_mismatches <- cowplot::get_legend( fig3[["8 mismatches"]] + 
                      # create some space to the left of the legend    
                      theme(legend.box.margin = margin(t = 20, r = 0, b = 20, l = 20))
                    )

legend_size_distri <- cowplot::get_legend( fig3[["4 size_distri"]] + 
                      # create some space to the left of the legend    
                      theme(legend.box.margin = margin(t = 20, r = 0, b = 20, l = 20))
                    )


fig_subV4 <- cowplot::plot_grid(fig3[["8 mismatches positions fwd"]] + theme(legend.position="none") + ggtitle("fwd")+ xlab(""), 
                               fig3[["8 mismatches positions rev"]] + theme(legend.position="none") + ggtitle("rev"),
                               nrow = 2,
                               align = "v" )

fig_subV9 <- cowplot::plot_grid(fig3[["27 mismatches positions fwd"]] + theme(legend.position="none") + ggtitle("fwd")+ xlab(""), 
                               fig3[["27 mismatches positions rev"]] + theme(legend.position="none") + ggtitle("rev"),
                               nrow = 2,
                               align = "v" )


fig <- cowplot::plot_grid(  legend_mismatches, 
                            legend_size_distri, 
                            fig3[["8 mismatches"]] + theme(legend.position="none"),
                            fig_subV4, 
                            fig3[["8 size_distri"]] + theme(legend.position="none"), 
                            fig3[["8 size"]],
                            fig3[["27 mismatches"]]+ theme(legend.position="none"),
                            fig_subV9, 
                            fig3[["27 size_distri"]]+ theme(legend.position="none"), 
                            fig3[["27 size"]], 
                   labels = c("","","A - Primer V4 #08","","","", "B - Primer V9 #27","","",""), 
                   label_y = 1.05, hjust = -0.1, scale=0.95,
                   ncol = 2, nrow = 5, rel_heights = c(4,20,20,20,20),
                   align = "h" )

fig

ggsave(plot= fig , filename="figs/fig_examples_V4_V9.pdf",
           width = 20 , height = 25, scale=2.5, units="cm", useDingbats=FALSE)

11.4 Fig. S2 and S5 - Supergroup analysis

row_height = 5

for (specific_one in c("general", "specific")) {
    
    
    fig_3 <- cowplot::plot_grid(fig_supergroup[[str_c("mismatches", specific_one, sep = " ")]], NULL, fig_supergroup[[str_c("size", specific_one, sep = " ")]], 
        NULL, labels = c("A", "", "B", ""), ncol = 2, nrow = 2, align = "v", rel_widths = c(13, 0.2))
    
    print(fig_3)
    
    height <- row_height * (ceiling(n_primers[[specific_one]]/ncol))
    
    ggsave(plot = fig_3, filename = str_c("figs/fig_supergroup_", specific_one, ".pdf"), width = 20, height = height, scale = 1.75, units = "cm", useDingbats = FALSE)
    
    # ggsave(plot= fig_supergroup[[str_c('mismatches', specific_one, sep = ' ')]] , filename=str_c('figs/fig_supergroup_', specific_one, '_A.pdf'),
    # width = 14 , height = height/2, scale=1.75, units='cm', useDingbats=FALSE) ggsave(plot= fig_supergroup[[str_c('size', specific_one, sep = ' ')]]
    # , filename=str_c('figs/fig_supergroup_', specific_one, '_B.pdf'), width = 14 , height = height/2, scale=1.75, units='cm', useDingbats=FALSE)
    
}

11.5 Fig. S6 - Class analysis

row_height = 3.5

for (specific_one in c("general", "specific")) {
    
    fig_4 <- cowplot::plot_grid(fig_class[[str_c("pct", specific_one, sep = " ")]], ncol = 1, nrow = 1, align = "v")
    
    print(fig_4)
    
    height <- row_height * (trunc(n_primers[[specific_one]]/ncol))
    
    ggsave(plot = fig_4, filename = str_c("figs/fig_class_", specific_one, ".pdf"), width = 10, height = height, scale = 3, units = "cm", useDingbats = FALSE)
    
    
}

11.6 Fig 1 - Primer sets position

reorder_primer <- function(x, y) {
    x + 0.01 * (y - x)
}

fig <- ggplot(filter(primer_sets, !is.na(fwd_start))) + geom_segment(aes(x = fwd_start, xend = rev_end, y = forcats::fct_reorder2(primer_set_name, 
    fwd_start, rev_end, reorder_primer), yend = forcats::fct_reorder2(primer_set_name, fwd_start, rev_end, reorder_primer), color = specific), size = 3) + 
    geom_text(aes(x = rev_end + 50, y = forcats::fct_reorder2(primer_set_name, fwd_start, rev_end, reorder_primer), label = str_c(primer_set_id, gene_region, 
        str_replace_all(primer_set_name, c(`_` = " ", V4 = "", V9 = "")), str_replace_na(specificity, ""), sep = " ")), size = 3, hjust = 0) + scale_color_manual(name = "", 
    values = c(general = "gray30", specific = "grey70")) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal", 
    panel.grid.major.y = element_blank(), axis.text.y = element_blank(), axis.line.y = element_blank(), axis.ticks.y = element_blank(), panel.border = element_blank()) + 
    
labs(x = "Position on 18S rRNA", y = "", legend = "") + xlim(0, 2200) + coord_cartesian(clip = "off")

fig

ggsave(plot = fig, filename = str_c("figs/fig_primer_sets_position.pdf"), width = 10, height = 15, scale = 2, units = "cm", useDingbats = FALSE)