Currently there are two major R functions to assign metabarcoding sequences:

- assignTaxonomy from the dada2 package (based on the RDP Naive Bayesian classifier)
- IdTaxa from the DECIPHER package (Murali, A., Bhargava, A., & Wright, E. S. (2018). IDTAXA: A novel approach for accurate taxonomic classification of microbiome sequences. Microbiome, 6(1), 140.)

IdTaxa requires training a model first, which can take some time depending on the size of the reference database. However, once the model is trained, assignment is quite fast.
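For comparison, here is a minimal sketch of the dada2 approach (it is not used in the rest of this tutorial). It assumes the dada2-formatted PR2 fasta used later for training and the small ASV fasta assigned at the end of this tutorial:

# Illustrative sketch only: assigning the same ASVs with dada2::assignTaxonomy()
library(dada2)
library(Biostrings)

asv_seqs <- as.character(readDNAStringSet("examples/Singapore ASV_sample.fasta"))

taxa <- assignTaxonomy(
  asv_seqs,
  refFasta = "examples/pr2_version_5.1.0_SSU_dada2.fasta.gz",
  taxLevels = c("domain", "supergroup", "division", "subdivision",
                "class", "order", "family", "genus", "species"),
  multithread = TRUE
)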
In this tutorial we explain how to train the model and assign a small metabarcoding dataset. The original code comes from the DECIPHER manual.
Here we perform the training on a very small set of 100 sequences. Typically, each training iteration on the full PR2 database takes about 7 hours on a 2.3 GHz PC with 8 processors and 32 GB of memory, so we recommend performing the training on a dedicated server. The trained model is available as part of the latest PR2 version.
Load the necessary libraries
- See the instructions on the Bioconductor page to install DECIPHER
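A minimal set of library calls, inferred from the functions used in this tutorial (adjust to your own setup):

library(DECIPHER)   # LearnTaxa(), IdTaxa(); also attaches Biostrings for readDNAStringSet()
library(dplyr)      # mutate(), filter(), joins
library(tidyr)      # separate(), pivot_longer(), pivot_wider()
library(stringr)    # str_c(), str_replace()
library(purrr)      # reduce()
library(rio)        # import() / export() for xlsx files
library(DT)         # interactive tables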
Define parameters and files
Please refer to the DECIPHER manual for information about the different parameters.
file_training = "examples/pr2_version_5.1.0_SSU_dada2.fasta.gz"
file_trained = "examples/pr2_version_5.1.0_SSU.trained.sample.rds"
file_problems = "examples/pr2_version_5.1.0_SSU.problems.sample.rds"
maxGroupSize <- 10 # max sequences per label (>= 1)
allowGroupRemoval <- FALSE
maxIterations <- 3 # must be >= 1
taxo_levels = c("domain", "supergroup", "division", "subdivision", "class", "order", "family", "genus", "species")
Train model
Define function to create taxid
decipher_build_taxid <- function(groups, taxo_levels) {

  # Define the ranks and create a parent-child relationship
  pr2_ranks <- taxo_levels

  pr2_taxonomy <- data.frame(taxo_groups = groups) |>
    mutate(taxo_groups = str_replace(taxo_groups, "Root;", "")) |>
    tidyr::separate(taxo_groups, sep = ";", into = taxo_levels) |>
    dplyr::distinct()

  # Process the data and write to a new file
  taxid <- pr2_taxonomy |>
    pivot_longer(
      cols = all_of(pr2_ranks),
      names_to = "rank",
      values_to = "name"
    ) |>
    filter(name != "") |>
    mutate(
      ID = row_number(),
      parent_name = if_else(rank == pr2_ranks[1], NA, lag(name)),
      parent_rank = if_else(rank == pr2_ranks[1], NA, lag(rank))
    ) |>
    select(
      ID,
      parent_name,
      parent_rank,
      name,
      rank
    ) |>
    distinct(name, rank, .keep_all = TRUE)

  taxid_parents <- taxid |>
    select(parent_ID = ID, parent_name = name, parent_rank = rank)

  taxid <- left_join(taxid, taxid_parents) |>
    mutate(rank_number = match(rank, taxo_levels),
           parent_ID = if_else(is.na(parent_ID), 0, parent_ID)) |>
    select(Index = ID, Name = name, Parent = parent_ID, Level = rank_number, Rank = rank)

  root_taxid <- data.frame(Index = 0, Name = "Root", Parent = -1, Level = 0, Rank = "rootrank")

  taxid <- bind_rows(root_taxid, taxid)

  return(taxid)
}
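As a quick illustration (the two taxonomic paths below are made up for the example), the function returns a data frame with the columns expected by LearnTaxa (Index, Name, Parent, Level, Rank), with a Root row at Index 0:

# Hypothetical example: two Root-prefixed taxonomy strings with 9 ranks each
toy_groups <- c(
  "Root;Eukaryota;TSAR;Alveolata;Dinoflagellata;Dinophyceae;Gymnodiniales;Gymnodiniaceae;Gymnodinium;Gymnodinium_sp.",
  "Root;Eukaryota;TSAR;Alveolata;Dinoflagellata;Syndiniales;Dino-Group-I;Dino-Group-I-Clade-1;Dino-Group-I-Clade-1_X;Dino-Group-I-Clade-1_X_sp."
)
toy_taxid <- decipher_build_taxid(toy_groups, taxo_levels)
head(toy_taxid)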
Iterations
# Read data -------------------------------------------------------------
seqs <- readDNAStringSet(file_training)

# Sample 100 sequences ----------------------------------------------------
seqs <- seqs[sample(length(seqs), 100)]

# Taxo groups -------------------------------------------------------------
# Obtain the taxonomic assignments
groups <- names(seqs) # sequence names

# All taxonomic paths must start with "Root;"
groups <- str_c("Root;", groups)
names(seqs) <- groups

groupCounts <- table(groups)
u_groups <- names(groupCounts) # unique groups
cat("Number of groups: ", length(u_groups), "\n") # number of groups
#> Number of groups: 742
taxid <- decipher_build_taxid(u_groups, taxo_levels)
# Pruning group size -------------------------------------------------------
remove <- logical(length(seqs))
for (i in which(groupCounts > maxGroupSize)) {
  index <- which(groups == u_groups[i])
  keep <- sample(length(index), maxGroupSize)
  remove[index[-keep]] <- TRUE
}
cat("Number of sequences eliminated: ", sum(remove), "\n")
#> Number of sequences eliminated: 38
# Iteratively train the classifier ------------------------------------------
probSeqsPrev <- integer() # suspected problem sequences from prior iteration
df.problems <- list()

cat("Number of iterations:", maxIterations, "\n")
#> Number of iterations: 3

for (i in seq_len(maxIterations)) {
  cat("Training iteration: ", i, "\n", sep = "")

  # Train the classifier
  trainingSet <- LearnTaxa(seqs[!remove], names(seqs)[!remove], taxid)

  # Look for problem sequences
  probSeqs <- trainingSet$problemSequences$Index
  cat("Number of problem sequences: ", length(probSeqs), "\n", sep = "")

  # Exit if no more problem sequences, if the same problems as in the previous
  # iteration remain, or if the maximum number of iterations is reached
  if (length(probSeqs) == 0) {
    cat("No problem sequences remaining.\n")
    break
  } else if (length(probSeqs) == length(probSeqsPrev) && all(probSeqsPrev == probSeqs)) {
    cat("Iterations converged.\n")
    break
  }
  if (i == maxIterations)
    break

  # Remove any problem sequences
  probSeqsPrev <- probSeqs
  index <- which(!remove)[probSeqs]
  remove[index] <- TRUE # remove all problem sequences
  df.problems[[i]] <- data.frame(index, trainingSet$problemSequences)

  if (!allowGroupRemoval) {
    # Replace any removed groups
    missing <- !(u_groups %in% groups[!remove])
    missing <- u_groups[missing]
    if (length(missing) > 0) {
      index <- index[groups[index] %in% missing]
      remove[index] <- FALSE # don't remove
    }
  }
}
#> Training iteration: 1
#> ================================================================================
#>
#> Time difference of 68.25 secs
#>
#> Number of problem sequences: 1
#> Training iteration: 2
#> ================================================================================
#>
#> Time difference of 122.55 secs
#>
#> Number of problem sequences: 1
#> Iterations converged.
Save problematic sequences
Problematic sequences are sequences that, during training, are not classified to the group they belong to in the reference taxonomy. They are removed before the next iteration. They point to inconsistencies in the reference database.
cat("Total number of sequences eliminated: ", sum(remove), "\n")
cat("Number of remaining problem sequences: ", length(probSeqs), "\n")
df.problems <- reduce(df.problems, bind_rows) %>%
select(-Index)
saveRDS(df.problems, file_problems)
DT::datatable(head(df.problems, 20), width = 800, caption = "First 20 problematic sequences.")
Save training set
saveRDS(trainingSet, file_trained)
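Optionally, the trained classifier can be inspected: the DECIPHER documentation shows that plotting the training set displays the taxonomic tree together with problem groups and sequences, which is a useful sanity check before using it for assignment.

# Optional: visualize the training set (taxonomic tree and problem groups)
plot(trainingSet)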
Assign small metabarcoding data set
Files
file_to_assign = "examples/Singapore ASV_sample.fasta"
file_to_assign_xlsx = "examples/Singapore ASV_sample.xlsx"
file_assigned = "examples/Singapore ASV_sample.decipher.5.1.0.rds"
file_assigned_xlsx = "examples/Singapore ASV_sample.decipher.5.1.0.xlsx"
Assign
# Read training set
trainingSet <- readRDS(file_trained)

# Read sequences to assign
asv_to_assign <- import(file_to_assign_xlsx)
seq <- readDNAStringSet(file_to_assign)

# Get the taxonomy from the training set
ids <- IdTaxa(seq,
              trainingSet,
              type = "extended",
              strand = "top",
              threshold = 0)
ids
#> ================================================================================
#>
#> Time difference of 1.5 secs
# A test set of class 'Taxa' with length 99
# confidence name taxon
# [1] 12% asv_0004 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Gammaproteobacteria; Alteromonadales; Psychromonadaceae; Psychromonas; Psychromonas_sp.
# [2] 8% asv_2263 Root; Eukaryota; TSAR; Alveolata; Dinoflagellata; Syndiniales; Dino-Group-I; Dino-Group-I-Clade-1; Dino-Group-I-Clade-1_X; Dino-Group-I-Clade-1_X_sp.
# [3] 21% asv_0664 Root; Eukaryota; TSAR; Alveolata; Dinoflagellata; Syndiniales; Dino-Group-II; Dino-Group-II-Clade-20; Dino-Group-II-Clade-20_X; Dino-Group-II-Clade-20_X_sp.
# [4] 59% asv_1095 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Alphaproteobacteria; Rhodobacterales; Rhodobacteraceae; Rhodobacter; Rhodobacter_sp.
# [5] 26% asv_0953 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Gammaproteobacteria; Alteromonadales; Psychromonadaceae; Psychromonas; Psychromonas_sp.
# ... ... ... ...
# [95] 17% asv_2793 Root; Eukaryota; TSAR; Stramenopiles; Gyrista; Gyrista_X; Gyrista_XX; MAST-1; MAST-1C; MAST-1C_sp.
# [96] 10% asv_2348 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Gammaproteobacteria; Alteromonadales; Psychromonadaceae; Psychromonas; Psychromonas_sp.
# [97] 15% asv_1507 Root; Eukaryota; TSAR; Alveolata; Dinoflagellata; Syndiniales; Dino-Group-II; Dino-Group-II-Clade-20; Dino-Group-II-Clade-20_X; Dino-Group-II-Clade-20_X_sp.
# [98] 3% asv_1151 Root; Eukaryota; Obazoa; Opisthokonta; Metazoa; Arthropoda; Crustacea; Malacostraca; Linuparus; Linuparus_trigonus
# [99] 14% asv_2741 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Alphaproteobacteria; Rhodobacterales; Rhodobacteraceae; Rhodobacter; Rhodobacter_sp.
# Transform to a dataframe
# Note: ids are provided as a list so we need to transform them to a dataframe
n_seq <- length(ids)
df_rows <- list()

# Go through all the elements of the list
for (i in 1:n_seq) {
  seq_name <- names(ids[i])
  taxonomy <- ids[[i]]$taxon
  confidence <- ids[[i]]$confidence
  df_rows[[i]] <- data.frame(seq_name, taxonomy, confidence, taxo_level = c("Root", taxo_levels))
}

df <- reduce(df_rows, bind_rows) %>%
  filter(taxo_level != "Root") %>%
  pivot_wider(names_from = taxo_level, values_from = c(taxonomy, confidence))
# Save to file
saveRDS(df, file_assigned)
# Merge with original ASV file
asv_assigned <- left_join(asv_to_assign, df) %>%
relocate(sequence, .after = last_col())
# Save as xlsx file
export(asv_assigned, file_assigned_xlsx)
# Display first 20 ASVs
DT::datatable(head(asv_assigned, 20), width = 800, caption = "First 20 ASVs reassigned.")
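Because threshold = 0 was used, every ASV is assigned down to the species level regardless of confidence. A common follow-up step (not part of the workflow above, shown here only as a sketch) is to mask ranks assigned with low confidence; the column names follow the pivot_wider() call above (taxonomy_<rank>, confidence_<rank>):

# Sketch: blank out taxonomy at ranks assigned below a chosen confidence cut-off
min_confidence <- 50  # arbitrary example value

df_filtered <- df
for (level in taxo_levels) {
  low_conf <- df_filtered[[paste0("confidence_", level)]] < min_confidence
  df_filtered[[paste0("taxonomy_", level)]][low_conf] <- NA
}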