PR2 version 4.14.0
Cafeteriales from Alex Schoenle
1 Reference
- Schoenle, A., Hohlfeld, M., Rosse, M., Filz, P., Wylezich, C., Nitsche, F., & Arndt, H. (2020). Global comparison of bicosoecid Cafeteria-like flagellates from the deep ocean and surface waters, with reorganization of the family Cafeteriaceae. European Journal of Protistology, 73, 125665. https://doi.org/10.1016/j.ejop.2019.125665
Comments
- Several sequences have been deposited as Pseudobodo, but are actually Cafeteria species as well.
- I know that Cavalier Smith has renamed several Pseudobodo species to a new species “Boroka karpovii” (Borokaceae). Within my stramenopile 18S analysis (Schoenle et al. 2020), I found the Pseudobodo sequences to be far away from the Cafeteriaceae, that is why I used the name Boroka karpovii for them. However, I do not know if it is more suitable to change only the family level instead of the name. I marked the sequences in green.
- I blasted the sequence KX602057 again (I did not have the sequence in my 18S analysis of Stramenopiles) and it is not a Cafeteria species. But I was not sure to what I should change it. (marked red)
- AY665996 is as well not a Cafeteria species, I have changed it to Bicosoecida sp., but I am not sure if that this is the best way. (marked red)
2 Init
source('../PR2_init.R', echo=FALSE)
source('../PR2_read_google.R', echo=FALSE)
pr2_18S <- pr2 %>%
filter(gene == "18S_rRNA")
pr2_18S_removed <- pr2_18S %>%
filter(!is.na(removed_version))3 Set up the files
target_group = c("Cafeteriaceae")
target_level = "family"
dir_pr2_update <- "Cafeteria"
pr2.env$editor <- "A. Schoenle"
full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}
file_pr2_update_excel <- full_path("pr2_Cafeteriaceae_2020_11_21_AS.xlsx")
# create the directory for taxonomy output
dir.create(full_path("taxo"))4 Read the original data and reformat
4.1 Read the data
- Number of sequences = 72
pr2_update <- import(file_pr2_update_excel, sheet = "sequences", guess_max=200000, na=c("", "-"))
str_c("Number of sequences : ", nrow(pr2_update))4.2 Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.Ron serverRun second part
PR2-update-GenBank.R
pr2_new <- filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))
DT::datatable(pr2_new, caption = "Updated sequences that are not present in PR2 ")
# Use this data frame to download new sequences and then restart from beginning
pr2_new <- pr2_new %>%
select(genbank_accession)4.3 Compare with sequences in PR2
Sequences in target group in PR2: 42
Sequences in target group in PR2 that need update: 41
Updated sequences that are not active in PR2: 4
Sequences duplicated (e.g. with and without introns): 0
filter(pr2_18S, !!as.symbol(target_level) %in% target_group) %>%
DT::datatable(caption = "Sequences of target group in pr2 ")
filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession)) %>%
DT::datatable(caption = "Sequences of PR2 that need update")
filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession)) %>%
DT::datatable(caption = "Updated sequences that are not active in PR2")
filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))%>%
DT::datatable(caption = "Updated sequences that are not present in PR2 ")
filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) &
!(genbank_accession %in% pr2_update$genbank_accession)) %>%
DT::datatable(caption = "Sequences from target group in PR2 that are not in update")
left_join(select(pr2_update, genbank_accession),
select(pr2_18S, genbank_accession, pr2_accession)) %>%
count(genbank_accession) %>%
filter(n > 1) %>%
DT::datatable(caption = "Sequences updated with 2 entries in PR2 (e.g. with and without introns) ")5 Taxonomy
5.1 Build and check
pr2_taxo_updated <- pr2_update %>%
group_by_at(pr2.env$taxo_levels[[8]]) %>%
count()
pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[8]], full_path("taxo"))
pr2_taxo_updated <- pr2_taxo_updated %>%
rename_all(~ str_c(.,"_new" )) %>%
dplyr::rename(species = species_new) %>%
left_join(pr2_taxo) %>%
rename_at(pr2.env$taxo_levels[[8]], ~ str_c(.,"_old" )) %>%
rename_all( ~ str_replace(.,"_new", "_8" )) %>%
dplyr::rename(species_8 = species_old) %>%
mutate(species_9 = species_8,
genus_9 = genus_8,
family_9 = family_8,
taxo_edited_version = pr2.env$version,
taxo_edited_by = "A.Schoenle")
export(pr2_taxo_updated, full_path("taxo_updated.xlsx"))5.2 Find taxa in PR2 that are not included in the update
pr2_taxo_not_updated <- pr2_taxo %>%
filter(!!as.symbol(target_level) %in% target_group) %>%
filter(!(species %in% pr2_taxo_updated$species_8))
export(pr2_taxo_not_updated , full_path("taxo_not_updated.xlsx"))6 Finalization
6.1 Sequences that need updating
pr2_update_final <- pr2_update %>%
select(genbank_accession, species, start, end) %>%
dplyr::rename(species_new = species,
start_new = start,
end_new = end) %>%
left_join(select(pr2_main, pr2_accession, genbank_accession, species, start, end, removed_version, edited_remark))6.2 Sequences with different start or end
The actual sequence would need updating the sequences in table pr2_sequences
- No sequences found
pr2_sequence_updated <- pr2_update_final %>%
filter((start != start_new)|(end != end_new))
pr2_sequence_updated %>%
DT::datatable(caption = "Actual sequence and pr2_accession needs to be updated ")6.3 Sequences without species name or with different species
- Sequences adde: 31
- Sequences updated: 30
pr2_main_updated <- pr2_update_final %>%
filter((species != species_new)|is.na(species))
pr2_main_updated %>%
DT::datatable(caption = "Actual sequence and pr2_accession needs to be updated ")
glue::glue("Number of updated sequences {nrow(filter(pr2_main_updated, !is.na(species)))}")
glue::glue("Number of new sequences {nrow(filter(pr2_main_updated, is.na(species)))}")6.3.1 Add fields
pr2_main_updated <- pr2_main_updated %>%
select (pr2_accession,
species_8 = species_new,
species_9 = species_new,
removed_version,
edited_remark) %>%
mutate(edited_version = pr2.env$version,
edited_by = pr2.env$editor) 7 Save everything to an Excel file
file_pr2_imports <- full_path("pr2_imports_final.xlsx")
onglets <- list("pr2_main_updated" = pr2_main_updated
# "pr2_sequences_updated" = pr2_sequence_updated
)
export(onglets, file_pr2_imports)