Currently there are two major functions under R to assign metabarcoding sequences:
- AssignTax from the dada2 package (based on the RDP Naive Bayesian assigner)
- IDTax from the DECIPHER package (Murali, A., Bhargava, A., & Wright, E. S. (2018). IDTAXA: A novel approach for accurate taxonomic classification of microbiome sequences. Microbiome, 6(1), 140.)
IDTax requires first to train the model which can takes a bit of time depending on the database size. However, after the model is trained assignement is quite fast.
In this tutorial we explain how to train the model and assign a small metabarcoding dataset. The original code originates from DECIPHER manual
Here we perform the training on a very small set of 100 sequences. Typically each training iteration for the full PR2 database will take about 7 hours on 2.3 GHz PC with 8 processors and 32 G of memory. In contrast on the ABIMS Roscoff server using 32G and 1 processor, one iteration took 20 min. The trained dataset is about 300 Mb. It is available as part of the latest PR2 version.
Load the necessary libraries
- See instructions on the Bioconductor page to install DECIPHER
Define parameters and files
Please refer to DECIPHER manual for information about the different parameters.
file_training = "examples/pr2_version_4.14.0_SSU_dada2.fasta.gz"
file_trained = "examples/pr2_version_4.14.0_SSU.trained.sample.rds"
file_problems = "examples/pr2_version_4.14.0_SSU.problems.sample.rds"
maxGroupSize <- 10 # max sequences per label (>= 1)
allowGroupRemoval <- FALSE
maxIterations <- 3 # must be >= 1
Train model
Iterations
# Read data -------------------------------------------------------------
seqs <- readDNAStringSet(file_training)
# Sample 1000 sequences ---------------------------------------------------
seqs = seqs[sample(length(seqs), 1000)]
# Taxo groups -------------------------------------------------------------
# obtain the taxonomic assignments
groups <- names(seqs) # sequence names
# All taxos must start with "Root;"
groups <- str_c("Root;",groups)
names(seqs)<-groups
groupCounts <- table(groups)
u_groups <- names(groupCounts) # unique groups
cat("Number of groups: ", length(u_groups), '\n') # number of group
#> Number of groups: 742
taxid <- NULL
# Pruning group size ---
remove <- logical(length(seqs))
for (i in which(groupCounts > maxGroupSize)) {
index <- which(groups==u_groups[i])
keep <- sample(length(index), maxGroupSize)
remove[index[-keep]] <- TRUE
}
cat("Number of sequences eliminated: ", sum(remove), "\n")
#> Number of sequences eliminated: 38
# Iteratively train classifier -----------------------------------------------
probSeqsPrev <- integer() # suspected problem sequences from prior iteration
df.problems <- list()
cat("Number of iterations:", maxIterations, "\n", sep=" ")
#> Number of iterations: 3
for (i in seq_len(maxIterations)) {
cat("Training iteration: ", i, "\n", sep="")
# train the classifier
trainingSet <- LearnTaxa(seqs[!remove], names(seqs)[!remove],taxid)
# look for problem sequences
probSeqs <- trainingSet$problemSequences$Index
cat("Number of problem sequences: ", length(probSeqs), "\n", sep="")
# Exit if no more problem sequences or same problems as previous or reach max Iter
if (length(probSeqs)==0) {
cat("No problem sequences remaining.\n")
break
} else if (length(probSeqs)==length(probSeqsPrev) && all(probSeqsPrev==probSeqs)) {
cat("Iterations converged.\n")
break
}
if (i==maxIterations)
break
# remove any problem sequences
probSeqsPrev <- probSeqs
index <- which(!remove)[probSeqs]
remove[index] <- TRUE # remove all problem sequences
df.problems[[i]] <- data.frame(index, trainingSet$problemSequences)
if (!allowGroupRemoval) {
# replace any removed groups
missing <- !(u_groups %in% groups[!remove])
missing <- u_groups[missing]
if (length(missing) > 0) {
index <- index[groups[index] %in% missing]
remove[index] <- FALSE # don't remove
}
}
}
#> Training iteration: 1
#> ================================================================================
#>
#> Time difference of 976.07 secs
#>
#> Number of problem sequences: 9
#> Training iteration: 2
#> ================================================================================
#>
#> Time difference of 846.92 secs
#>
#> Number of problem sequences: 5
#> Training iteration: 3
#> ================================================================================
#>
#> Time difference of 882.58 secs
#>
#> Number of problem sequences: 5
Save problematic sequences
Problematic sequences are sequences that are not assigned to the group they should belong to during the training process. They are removed for the next iteration. They point out to incoherences in the reference database.
cat("Total number of sequences eliminated: ", sum(remove), "\n")
cat("Number of remaining problem sequences: ", length(probSeqs), "\n")
df.problems <- reduce(df.problems, bind_rows) %>%
select(-Index)
saveRDS(df.problems, file_problems)
DT::datatable(head(df.problems, 20), width = 800, caption = "First 20 problematic sequences.")
Save training set
saveRDS(trainingSet,file_trained)
Assign small metabarcoding data set
Files
taxo_levels <- c("kingdom", "supergroup", "division", "class", "order", "family", "genus", "species")
file_to_assign = "examples/Singapore ASV_sample.fasta"
file_to_assign_xlsx = "examples/Singapore ASV_sample.xlsx"
file_assigned = "examples/Singapore ASV_sample.decipher.4.14.0.rds"
file_assigned_xlsx = "examples/Singapore ASV_sample.decipher.4.14.0.xlsx"
Assign
# Read training set
trainingSet <- readRDS(file_trained)
# Read sequences to assign
asv_to_assign <- import(file_to_assign_xlsx)
seq <- readDNAStringSet(file_to_assign)
# Get the taxonomy from the training set
ids <- IdTaxa(seq,
trainingSet,
type="extended",
strand="top",
threshold=0)
ids
#> ================================================================================
#>
#> Time difference of 7.47 secs
ids
#> A test set of class 'Taxa' with length 99
#> confidence name taxon
#> [1] 6% asv_0004 Root; Bacteria; Bacteria_X; Proteobacter...
#> [2] 50% asv_2263 Root; Eukaryota; Alveolata; Dinoflagella...
#> [3] 36% asv_0664 Root; Eukaryota; Alveolata; Dinoflagella...
#> [4] 26% asv_1095 Root; Bacteria; Bacteria_X; Proteobacter...
#> [5] 24% asv_0953 Root; Bacteria; Bacteria_X; Proteobacter...
#> ... ... ... ...
#> [95] 13% asv_2793 Root; Eukaryota; Stramenopiles; Pseudofu...
#> [96] 3% asv_2348 Root; Bacteria; Bacteria_X; Proteobacter...
#> [97] 56% asv_1507 Root; Eukaryota; Alveolata; Dinoflagella...
#> [98] 2% asv_1151 Root; Eukaryota; Opisthokonta; Metazoa; ...
#> [99] 13% asv_2741 Root; Bacteria; Bacteria_X; Proteobacter...
# Transform to a dataframe
# Note: ids are provided as a list so we need to transform to dataframe
# Transform to a dataframe
# Note: ids are provided as a list so we need to transform to dataframe
n_seq <- length(ids)
df_rows <- list()
# Go through all the elements of the list
for(i in 1:n_seq){
seq_name <- names(ids[i])
taxonomy<- ids[[i]]$taxon
confidence <- ids[[i]]$confidence
df_rows[[i]] = data.frame(seq_name, taxonomy, confidence, taxo_level=c("Root", taxo_levels))
}
df <- reduce(df_rows, bind_rows) %>%
filter(taxo_level !="Root") %>%
pivot_wider(names_from = taxo_level, values_from = c(taxonomy, confidence))
# Save to file
saveRDS(df, file_assigned)
# Merge with original ASV file
asv_assigned <- left_join(asv_to_assign, df) %>%
relocate(sequence, .after = last_col())
# Save as xlsx file
export(asv_assigned, file_assigned_xlsx)
# Display first 20 ASVs
DT::datatable(head(asv_assigned, 20), width = 800, caption = "First 20 ASV reassigned.")