PR2 version 4.13.0
Adding Silva sequences
PR2 version 4.13.0
Adding Silva sequences
1 Init
Load the variables common to the different scripts and the necessary libraries
2 Read pr2
- Use the PR2 from Google database
# Info for the Google dabase
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
collect()
# pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>%
# collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
filter (is.na(taxo_removed_version)) %>%
collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>%
collect()
# Join the tables and keep only sequences that are not removed
pr2 <- pr2_main %>%
left_join(pr2_taxo, by = c("species"="species")) %>%
# left_join (pr2_seq) %>%
left_join (pr2_metadata)
db_disconnect(pr2_db_con)
pr2_18S <- pr2 %>%
filter (is.na(removed_version)) %>%
filter(gene == "18S_rRNA") %>%
filter(!is.na(species)) %>%
filter(!is.na(kingdom))
pr2_18S_no_taxo <- pr2 %>%
filter(gene == "18S_rRNA") %>%
filter(is.na(species))
pr2_18S_removed <- pr2 %>% filter (!is.na(removed_version)) %>%
filter(gene == "18S_rRNA")%>%
filter(!is.na(species))
# Remove pr2 to make sure that we do not use the full pr2
# remove(pr2)
pr2_18S %>% filter(is.na(kingdom))
# pr2_export(pr2_18S, file_name=pr2.env$file_18S_fasta_dada2, file_type="fasta", file_format="dada2")
3 Set up the files
dir_pr2_update <- "../updates/done - 2020 4.13.0 Silva"
pr2.env$editor <- "D. Vaulot"
full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}
# file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva samples.fasta")
file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva.fasta.gz")
# create the directory for taxonomy output
dir.create(full_path("taxo"), showWarnings = FALSE)
4 Read fasta file
pr2_update <- Biostrings::readRNAStringSet(file_fasta)
# Convert to DNA
pr2_update <- Biostrings::DNAStringSet(pr2_update)
# Convert to Data frame
pr2_update <- data.frame(sequence=pr2_update) %>%
rownames_to_column(var = "seq_name") %>%
tidyr::separate(seq_name, c("genbank_accession", "start", "end", "silva_taxonomy"), sep = "[. ]", extra = "merge") %>%
mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_U"))
4.1 Save to xlsx for import into pr2
4.2 Check sequences from PR2 that are assigned that have the same start-end than Silva
- PR2 sequences with species: 193084
- PR2 sequences with species and silva taxonomy based on pr2_accession: 162055
4.3 Compare with sequences in PR2
str_c("Number of updated sequences that are not present in PR2 : ",
nrow(filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))))
str_c("Number of updated sequences that are not active in PR2 : ",
nrow(filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession))))
str_c("Sequences duplicated - df ")
pr2_update %>%
count(genbank_accession) %>%
filter(n > 1)
5 Old sequences
6 New sequences
6.1 Filter new sequences
pr2_new <- pr2_update %>%
filter(!(genbank_accession %in% pr2$genbank_accession)) %>%
mutate(sequence_length = str_length(sequence),
ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex) ) %>%
filter((sequence_length >= pr2.env$sequence_length_min) &
(ambiguities <= pr2.env$ambiguities_max)) %>%
filter(start < 20000) # to avoid too big sequences from genomes
pr2_genbank_new <- pr2_new %>%
select(genbank_accession)
6.2 Save the list of new sequences
6.3 Get some statistics
6.4 Alveolata
silva_alveolata <- pr2_new %>%
filter(str_detect(silva_taxonomy, "Alveolata")) %>%
mutate(seq_name = str_c(genbank_accession,".", start, ".", end, " ", silva_taxonomy))
cat ("Number of Alveolata sequences", nrow(silva_alveolata))
fasta_write(silva_alveolata, full_path("silva_alveolata.fasta.gz"), compress=TRUE, taxo_include = FALSE)
6.5 Run the script PR2 genbank download.R in background
6.6 Read the metadata files and join by genbank accession
metafile <- list()
for (i in c(1:300)) {
file_name = full_path(str_c("metadata_01/pr2_gb_metadata_",i,".txt"))
if(file.exists(file_name)) {
metafile[[i]] <- read_tsv(file_name,
col_types = cols(gb_collection_date = col_character(),
gb_strain = col_character(),
gb_isolate = col_character()),
guess_max = 10000)
}
}
for (i in c(1:70)) {
file_name = full_path(str_c("metadata_02/pr2_gb_metadata_2_",i,".txt"))
if(file.exists(file_name)) {
metafile[[i+300]] <- read_tsv(file_name,
col_types = cols(gb_collection_date = col_character(),
gb_strain = col_character(),
gb_isolate = col_character()),
guess_max = 10000)
}
}
for (i in c(1:100)) {
file_name = full_path(str_c("metadata_05/pr2_gb_metadata_5_",i,".txt"))
if(file.exists(file_name)) {
metafile[[i+370]] <- read_tsv(file_name,
col_types = cols(gb_collection_date = col_character(),
gb_strain = col_character(),
gb_isolate = col_character()),
guess_max = 10000)
}
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows)%>%
filter(!is.na(gb_organism)) %>% # Remove entry that do not exist
select(-sequence) %>%
distinct()
pr2_new <- left_join(pr2_new, gb_metadata, by = c("genbank_accession" = "genbank_accession"))
7 Finalization
7.1 Add fields
pr2_new <- pr2_new %>%
# mutate(start=1, end=sequence_length,
# label = case_when (intron == "intron" ~ "G",
# TRUE ~"U")) %>%
mutate(#sequence=str_sub(gb_sequence,start,end),
# sequence_length = str_length(sequence),
label = "U",
pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = NA,
gene=case_when(str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA",
str_detect(gb_definition, "chloroplast") ~ "16S_rRNA",
TRUE ~ "18S_rRNA"),
organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph",
str_detect(gb_definition, "chloroplast") ~ "plastid",
TRUE ~ "nucleus"),
ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex)
)
pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1)
7.2 Remove sequences with ambiguities or too short
filter(pr2_new, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>%
arrange(desc(sequence_length)) %>%
select(pr2_accession, sequence_length,ambiguities, gb_definition)
pr2_new<- pr2_new %>%
filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))
7.3 Check duplicate entries
7.4 Final files for uploading
Final number of sequences: 333,247
sprintf(" Final number of sequences added: %d", nrow(pr2_new))
colnames(pr2_new)
pr2_update_new <- pr2_new
pr2_main_new <- pr2_update_new %>%
select (pr2_accession, genbank_accession, start,end, label, organelle, gene) %>%
mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
nrow(pr2_main_new)
pr2_sequences_new <- pr2_update_new %>%
select(pr2_accession, sequence, sequence_length, ambiguities, sequence_hash)
nrow(pr2_sequences_new)
pr2_metadata_new <- pr2_update_new %>%
filter(!is.na(gb_definition)) %>%
select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_"), starts_with("silva_")) %>%
select( -pr2_accession) %>%
distinct()
nrow(pr2_metadata_new)
7.5 Save everything to an Excel file
file_pr2_imports <- full_path("pr2_imports_final.xlsx")
onglets <- list(# "pr2_taxo_updated" = pr2_taxo_updated,
# "pr2_taxo_remove" = pr2_taxo_remove,
#"pr2_main_old" = pr2_main_old,
"pr2_metadata_old" = pr2_metadata_old,
"pr2_main_new" = pr2_main_new,
"pr2_sequence_new" = pr2_sequences_new,
"pr2_metadata_new" = pr2_metadata_new)
# write.xlsx(onglets, file_pr2_imports, zoom = 90, firstRow = TRUE, firstCol = TRUE)
write.xlsx(pr2_metadata_old, full_path("pr2_imports_metadata_old.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
write.xlsx(pr2_main_new, full_path("pr2_imports_main_new.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
write.xlsx(pr2_sequences_new, full_path("pr2_imports_sequences_new.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
write.xlsx(pr2_metadata_new, full_path("pr2_imports_metadata_new.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
8 One more round of search for metadata
Note: these entries correspond to contigs from metagenomes that point to a single record. They have to be searched one by one (script #6)
pr2_new_6 <- pr2 %>%
filter(is.na(pr2_metadata_id)) %>%
# slice_head(n = 200) %>%
select(pr2_accession, genbank_accession)
pr2_genbank_new_6 <- pr2_new_6 %>%
select(genbank_accession)
# saveRDS(pr2_genbank_new_6, full_path("pr2_genbank_new_6.rds"))
8.1 Run the script on server (script_genbank_06.R)
- Note that some of the pr2 entries correspond to only one GenBank entries for metagenomes. This has to be changed in the pr2_main table
8.2 Read the metadata files and join by genbank accession
metafile <- list()
for (i in c(1:500)) {
file_name = full_path(str_c("metadata_06/pr2_gb_metadata_6_",i,".txt"))
if(file.exists(file_name)) {
metafile[[i]] <- read_tsv(file_name,
col_types = cols(gb_collection_date = col_character(),
gb_strain = col_character(),
gb_isolate = col_character(),
gb_project_id = col_character(),
gb_specimen_voucher = col_character(),
gb_clone = col_character() ),
guess_max = 10000)
}
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>%
filter(!is.na(genbank_accession_new)) %>% # Remove all entries for which we could not find a GenBank entry
distinct()
write.xlsx(gb_metadata, full_path(str_c("metadata_06/pr2_gb_metadata_6.xlsx")))
# Some entries have the same genbank_accession and different pr2_accession
junk <- gb_metadata %>%
count(genbank_accession_old) %>%
filter(n> 1) %>%
arrange(desc(n))
pr2_new_6_metadata <- left_join(pr2_new_6, gb_metadata, by = c("genbank_accession" = "genbank_accession_old")) %>%
rename(genbank_accession_old = genbank_accession) %>%
filter(!is.na(genbank_accession_new)) # Remove all entries for which we could not find a GenBank entry
pr2_main_6 <- pr2_new_6_metadata %>%
select(pr2_accession, genbank_accession_new) %>%
rename(genbank_accession = genbank_accession_new)
# Some entries have the same genbank_accession and different pr2_accession
junk <- pr2_main_6 %>%
count(genbank_accession) %>%
filter(n> 1) %>%
arrange(desc(n))
pr2_metadata_6 <- pr2_new_6_metadata %>%
select(-pr2_accession, -genbank_accession_old) %>%
rename(genbank_accession = genbank_accession_new) %>%
filter(!(genbank_accession %in% pr2_metadata$genbank_accession)) %>%
distinct()
8.3 Save everything to an Excel file
- pr2_main: 70668 entries corresponding to 55512 updates. replace the old genbank_accession by the new one. Some pr2_accession correspond to the same genbank entry for metagenomes, e.g. OBEP000000000 corresponding to 44921 silva sequences.
- pr2_medata: 14247 entries
9 Get taxonomy of Silva and Genbank sequences from hash value
9.1 First iteration
By mistake I included all sequences including those removed…
pr2_hash <- pr2 %>%
count(sequence_hash) %>%
filter(n > 1)
pr2_hash <- pr2 %>%
select(pr2_accession, species, sequence_hash) %>%
filter(sequence_hash %in% pr2_hash$sequence_hash) %>%
arrange(sequence_hash)
pr2_hash_no_species <- pr2_hash %>%
filter(is.na(species)) %>%
select(-species)
pr2_hash_species <- pr2_hash %>%
filter(!is.na(species)) %>%
select(-pr2_accession) %>%
distinct()
# Check whether species with same hash have same species
pr2_hash_species %>%
distinct(species, sequence_hash) %>%
count(species, sequence_hash) %>%
filter(n > 1)
# Merge based on hash
pr2_hash_no_species <- pr2_hash_no_species %>%
left_join(pr2_hash_species) %>%
filter(!is.na(species)) %>%
mutate(edited_remark = "species assigned based on hash value")
write.xlsx(pr2_hash_no_species, full_path("pr2_imports_species_hash.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
9.2 Second iteration
Remove sequences similar to those already removed 2 sequences removed
10 Use dada2 AssignTaxonomy to assing the taxonomy of the Silva sequences
10.1 Export Fasta file for Silva sequences without taxonomy
186884 sequences
10.2 Launch script_assign_01.R on server
- Use 4.13.0 file
- Break up in pieces of 10 000 sequences
10.3 Build the pr2_assign table
metafile <- list()
for (i in c(1:34)) {
file_name = full_path(str_c("taxo/pr2_no_species_",sprintf("%03d", i),".dada2.taxo"))
if(file.exists(file_name)) {
metafile[[i]] <- read_tsv(file_name,
guess_max = 10000)
}
}
# The first file serves as a template for the following one when binding rows
pr2_assign <- reduce(metafile, bind_rows) %>%
distinct()
# Save the file
write.xlsx(pr2_assign, full_path("pr2_imports_silva_assigned.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)