Currently there are two major R functions to assign metabarcoding sequences:

- assignTaxonomy from the dada2 package (based on the RDP Naive Bayesian classifier)
- IdTaxa from the DECIPHER package (Murali, A., Bhargava, A., & Wright, E. S. (2018). IDTAXA: A novel approach for accurate taxonomic classification of microbiome sequences. Microbiome, 6(1), 140.)

IdTaxa requires training a model first, which can take some time depending on the size of the reference database. However, once the model is trained, assignment is quite fast.
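For comparison, here is a minimal sketch of the dada2 approach (it is not used in the rest of this tutorial). It assumes the dada2-formatted PR2 fasta used later for training and the small ASV fasta assigned at the end of this tutorial:

# Illustrative sketch only: assigning the same ASVs with dada2::assignTaxonomy()
library(dada2)
library(Biostrings)

asv_seqs <- as.character(readDNAStringSet("examples/Singapore ASV_sample.fasta"))

taxa <- assignTaxonomy(
  asv_seqs,
  refFasta = "examples/pr2_version_5.1.0_SSU_dada2.fasta.gz",
  taxLevels = c("domain", "supergroup", "division", "subdivision",
                "class", "order", "family", "genus", "species"),
  multithread = TRUE
)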
In this tutorial we explain how to train the model and assign a small metabarcoding dataset. The original code comes from the DECIPHER manual.
Here we perform the training on a very small set of 100 sequences. Typically, each training iteration on the full PR2 database takes about 7 hours on a 2.3 GHz PC with 8 processors and 32 GB of memory, so we recommend performing the training on a dedicated server. The trained model is available as part of the latest PR2 version.
Load the necessary libraries
- See the instructions on the Bioconductor page to install DECIPHER
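A minimal set of library calls, inferred from the functions used in this tutorial (adjust to your own setup):

library(DECIPHER)   # LearnTaxa(), IdTaxa(); also attaches Biostrings for readDNAStringSet()
library(dplyr)      # mutate(), filter(), joins
library(tidyr)      # separate(), pivot_longer(), pivot_wider()
library(stringr)    # str_c(), str_replace()
library(purrr)      # reduce()
library(rio)        # import() / export() for xlsx files
library(DT)         # interactive tables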
Define parameters and files
Please refer to the DECIPHER manual for information about the different parameters.
file_training = "examples/pr2_version_5.1.0_SSU_dada2.fasta.gz"
file_trained = "examples/pr2_version_5.1.0_SSU.trained.sample.rds"
file_problems = "examples/pr2_version_5.1.0_SSU.problems.sample.rds"
maxGroupSize <- 10 # max sequences per label (>= 1)
allowGroupRemoval <- FALSE
maxIterations <- 3 # must be >= 1
taxo_levels = c("domain", "supergroup", "division", "subdivision", "class", "order", "family", "genus", "species")
Train model
Define function to create taxid
decipher_build_taxid <- function(groups, taxo_levels) {

  # Define the ranks and create a parent-child relationship
  pr2_ranks <- taxo_levels

  pr2_taxonomy <- data.frame(taxo_groups = groups) |>
    mutate(taxo_groups = str_replace(taxo_groups, "Root;", "")) |>
    tidyr::separate(taxo_groups, sep = ";", into = taxo_levels) |>
    dplyr::distinct()

  # Process the data and write to a new file
  taxid <- pr2_taxonomy |>
    pivot_longer(
      cols = all_of(pr2_ranks),
      names_to = "rank",
      values_to = "name"
    ) |>
    filter(name != "") |>
    mutate(
      ID = row_number(),
      parent_name = if_else(rank == pr2_ranks[1], NA, lag(name)),
      parent_rank = if_else(rank == pr2_ranks[1], NA, lag(rank))
    ) |>
    select(
      ID,
      parent_name,
      parent_rank,
      name,
      rank
    ) |>
    distinct(name, rank, .keep_all = TRUE)

  taxid_parents <- taxid |>
    select(parent_ID = ID, parent_name = name, parent_rank = rank)

  taxid <- left_join(taxid, taxid_parents) |>
    mutate(rank_number = match(rank, taxo_levels),
           parent_ID = if_else(is.na(parent_ID), 0, parent_ID)) |>
    select(Index = ID, Name = name, Parent = parent_ID, Level = rank_number, Rank = rank)

  root_taxid <- data.frame(Index = 0, Name = "Root", Parent = -1, Level = 0, Rank = "rootrank")

  taxid <- bind_rows(root_taxid, taxid)

  return(taxid)
}
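As a quick illustration (the two taxonomic paths below are made up for the example), the function returns a data frame with the columns expected by LearnTaxa (Index, Name, Parent, Level, Rank), with a Root row at Index 0:

# Hypothetical example: two Root-prefixed taxonomy strings with 9 ranks each
toy_groups <- c(
  "Root;Eukaryota;TSAR;Alveolata;Dinoflagellata;Dinophyceae;Gymnodiniales;Gymnodiniaceae;Gymnodinium;Gymnodinium_sp.",
  "Root;Eukaryota;TSAR;Alveolata;Dinoflagellata;Syndiniales;Dino-Group-I;Dino-Group-I-Clade-1;Dino-Group-I-Clade-1_X;Dino-Group-I-Clade-1_X_sp."
)
toy_taxid <- decipher_build_taxid(toy_groups, taxo_levels)
head(toy_taxid)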
Iterations
# Read data -------------------------------------------------------------
seqs <- readDNAStringSet(file_training)

# Sample 100 sequences ----------------------------------------------------
seqs <- seqs[sample(length(seqs), 100)]

# Taxo groups -------------------------------------------------------------
# Obtain the taxonomic assignments
groups <- names(seqs) # sequence names

# All taxonomic paths must start with "Root;"
groups <- str_c("Root;", groups)
names(seqs) <- groups

groupCounts <- table(groups)
u_groups <- names(groupCounts) # unique groups
cat("Number of groups: ", length(u_groups), "\n") # number of groups
#> Number of groups: 742
taxid <- decipher_build_taxid(u_groups, taxo_levels)
# Pruning group size -------------------------------------------------------
remove <- logical(length(seqs))
for (i in which(groupCounts > maxGroupSize)) {
  index <- which(groups == u_groups[i])
  keep <- sample(length(index), maxGroupSize)
  remove[index[-keep]] <- TRUE
}
cat("Number of sequences eliminated: ", sum(remove), "\n")
#> Number of sequences eliminated: 38
# Iteratively train the classifier ------------------------------------------
probSeqsPrev <- integer() # suspected problem sequences from prior iteration
df.problems <- list()

cat("Number of iterations:", maxIterations, "\n")
#> Number of iterations: 3

for (i in seq_len(maxIterations)) {
  cat("Training iteration: ", i, "\n", sep = "")

  # Train the classifier
  trainingSet <- LearnTaxa(seqs[!remove], names(seqs)[!remove], taxid)

  # Look for problem sequences
  probSeqs <- trainingSet$problemSequences$Index
  cat("Number of problem sequences: ", length(probSeqs), "\n", sep = "")

  # Exit if no more problem sequences, if the same problems as in the previous
  # iteration remain, or if the maximum number of iterations is reached
  if (length(probSeqs) == 0) {
    cat("No problem sequences remaining.\n")
    break
  } else if (length(probSeqs) == length(probSeqsPrev) && all(probSeqsPrev == probSeqs)) {
    cat("Iterations converged.\n")
    break
  }
  if (i == maxIterations)
    break

  # Remove any problem sequences
  probSeqsPrev <- probSeqs
  index <- which(!remove)[probSeqs]
  remove[index] <- TRUE # remove all problem sequences
  df.problems[[i]] <- data.frame(index, trainingSet$problemSequences)

  if (!allowGroupRemoval) {
    # Replace any removed groups
    missing <- !(u_groups %in% groups[!remove])
    missing <- u_groups[missing]
    if (length(missing) > 0) {
      index <- index[groups[index] %in% missing]
      remove[index] <- FALSE # don't remove
    }
  }
}
#> Training iteration: 1
#> ================================================================================
#>
#> Time difference of 68.25 secs
#>
#> Number of problem sequences: 1
#> Training iteration: 2
#> ================================================================================
#>
#> Time difference of 122.55 secs
#>
#> Number of problem sequences: 1
#> Iterations converged.
Save problematic sequences
Problematic sequences are sequences that, during training, are not classified to the group they belong to in the reference taxonomy. They are removed before the next iteration. They point to inconsistencies in the reference database.
cat("Total number of sequences eliminated: ", sum(remove), "\n")
cat("Number of remaining problem sequences: ", length(probSeqs), "\n")
df.problems <- reduce(df.problems, bind_rows) %>%
select(-Index)
saveRDS(df.problems, file_problems)
DT::datatable(head(df.problems, 20), width = 800, caption = "First 20 problematic sequences.")
Save training set
saveRDS(trainingSet, file_trained)
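Optionally, the trained classifier can be inspected: the DECIPHER documentation shows that plotting the training set displays the taxonomic tree together with problem groups and sequences, which is a useful sanity check before using it for assignment.

# Optional: visualize the training set (taxonomic tree and problem groups)
plot(trainingSet)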
Assign small metabarcoding data set
Files
file_to_assign = "examples/Singapore ASV_sample.fasta"
file_to_assign_xlsx = "examples/Singapore ASV_sample.xlsx"
file_assigned = "examples/Singapore ASV_sample.decipher.5.1.0.rds"
file_assigned_xlsx = "examples/Singapore ASV_sample.decipher.5.1.0.xlsx"
Assign
# Read training set
trainingSet <- readRDS(file_trained)

# Read sequences to assign
asv_to_assign <- import(file_to_assign_xlsx)
seq <- readDNAStringSet(file_to_assign)

# Get the taxonomy from the training set
ids <- IdTaxa(seq,
              trainingSet,
              type = "extended",
              strand = "top",
              threshold = 0)
ids
#> ================================================================================
#>
#> Time difference of 1.5 secs
# A test set of class 'Taxa' with length 99
# confidence name taxon
# [1] 12% asv_0004 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Gammaproteobacteria; Alteromonadales; Psychromonadaceae; Psychromonas; Psychromonas_sp.
# [2] 8% asv_2263 Root; Eukaryota; TSAR; Alveolata; Dinoflagellata; Syndiniales; Dino-Group-I; Dino-Group-I-Clade-1; Dino-Group-I-Clade-1_X; Dino-Group-I-Clade-1_X_sp.
# [3] 21% asv_0664 Root; Eukaryota; TSAR; Alveolata; Dinoflagellata; Syndiniales; Dino-Group-II; Dino-Group-II-Clade-20; Dino-Group-II-Clade-20_X; Dino-Group-II-Clade-20_X_sp.
# [4] 59% asv_1095 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Alphaproteobacteria; Rhodobacterales; Rhodobacteraceae; Rhodobacter; Rhodobacter_sp.
# [5] 26% asv_0953 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Gammaproteobacteria; Alteromonadales; Psychromonadaceae; Psychromonas; Psychromonas_sp.
# ... ... ... ...
# [95] 17% asv_2793 Root; Eukaryota; TSAR; Stramenopiles; Gyrista; Gyrista_X; Gyrista_XX; MAST-1; MAST-1C; MAST-1C_sp.
# [96] 10% asv_2348 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Gammaproteobacteria; Alteromonadales; Psychromonadaceae; Psychromonas; Psychromonas_sp.
# [97] 15% asv_1507 Root; Eukaryota; TSAR; Alveolata; Dinoflagellata; Syndiniales; Dino-Group-II; Dino-Group-II-Clade-20; Dino-Group-II-Clade-20_X; Dino-Group-II-Clade-20_X_sp.
# [98] 3% asv_1151 Root; Eukaryota; Obazoa; Opisthokonta; Metazoa; Arthropoda; Crustacea; Malacostraca; Linuparus; Linuparus_trigonus
# [99] 14% asv_2741 Root; Bacteria; PANNAM; Proteobacteria; Proteobacteria_X; Alphaproteobacteria; Rhodobacterales; Rhodobacteraceae; Rhodobacter; Rhodobacter_sp.
# Transform to a dataframe
# Note: ids are provided as a list so we need to transform them to a dataframe
n_seq <- length(ids)
df_rows <- list()

# Go through all the elements of the list
for (i in 1:n_seq) {
  seq_name <- names(ids[i])
  taxonomy <- ids[[i]]$taxon
  confidence <- ids[[i]]$confidence
  df_rows[[i]] <- data.frame(seq_name, taxonomy, confidence, taxo_level = c("Root", taxo_levels))
}

df <- reduce(df_rows, bind_rows) %>%
  filter(taxo_level != "Root") %>%
  pivot_wider(names_from = taxo_level, values_from = c(taxonomy, confidence))
# Save to file
saveRDS(df, file_assigned)
# Merge with original ASV file
asv_assigned <- left_join(asv_to_assign, df) %>%
relocate(sequence, .after = last_col())
# Save as xlsx file
export(asv_assigned, file_assigned_xlsx)
# Display first 20 ASVs
DT::datatable(head(asv_assigned, 20), width = 800, caption = "First 20 ASVs reassigned.")
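Because threshold = 0 was used, every ASV is assigned down to the species level regardless of confidence. A common follow-up step (not part of the workflow above, shown here only as a sketch) is to mask ranks assigned with low confidence; the column names follow the pivot_wider() call above (taxonomy_<rank>, confidence_<rank>):

# Sketch: blank out taxonomy at ranks assigned below a chosen confidence cut-off
min_confidence <- 50  # arbitrary example value

df_filtered <- df
for (level in taxo_levels) {
  low_conf <- df_filtered[[paste0("confidence_", level)]] < min_confidence
  df_filtered[[paste0("taxonomy_", level)]][low_conf] <- NA
}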