PR2 version 4.14.0
Cafeteriales from Alex Schoenle
1 Reference
- Schoenle, A., Hohlfeld, M., Rosse, M., Filz, P., Wylezich, C., Nitsche, F., & Arndt, H. (2020). Global comparison of bicosoecid Cafeteria-like flagellates from the deep ocean and surface waters, with reorganization of the family Cafeteriaceae. European Journal of Protistology, 73, 125665. https://doi.org/10.1016/j.ejop.2019.125665
Comments
- Several sequences have been deposited as Pseudobodo, but are actually Cafeteria species as well.
- I know that Cavalier Smith has renamed several Pseudobodo species to a new species “Boroka karpovii” (Borokaceae). Within my stramenopile 18S analysis (Schoenle et al. 2020), I found the Pseudobodo sequences to be far away from the Cafeteriaceae, that is why I used the name Boroka karpovii for them. However, I do not know if it is more suitable to change only the family level instead of the name. I marked the sequences in green.
- I blasted the sequence KX602057 again (I did not have the sequence in my 18S analysis of Stramenopiles) and it is not a Cafeteria species. But I was not sure to what I should change it. (marked red)
- AY665996 is as well not a Cafeteria species, I have changed it to Bicosoecida sp., but I am not sure if that this is the best way. (marked red)
2 Init
source('../PR2_init.R', echo=FALSE)
source('../PR2_read_google.R', echo=FALSE)
<- pr2 %>%
pr2_18S filter(gene == "18S_rRNA")
<- pr2_18S %>%
pr2_18S_removed filter(!is.na(removed_version))
3 Set up the files
= c("Cafeteriaceae")
target_group = "family"
target_level
<- "Cafeteria"
dir_pr2_update
$editor <- "A. Schoenle"
pr2.env
<- function(file_name){str_c(dir_pr2_update,"/", file_name)}
full_path
<- full_path("pr2_Cafeteriaceae_2020_11_21_AS.xlsx")
file_pr2_update_excel
# create the directory for taxonomy output
dir.create(full_path("taxo"))
4 Read the original data and reformat
4.1 Read the data
- Number of sequences = 72
<- import(file_pr2_update_excel, sheet = "sequences", guess_max=200000, na=c("", "-"))
pr2_update
str_c("Number of sequences : ", nrow(pr2_update))
4.2 Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.R
on serverRun second part
PR2-update-GenBank.R
<- filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))
pr2_new ::datatable(pr2_new, caption = "Updated sequences that are not present in PR2 ")
DT
# Use this data frame to download new sequences and then restart from beginning
<- pr2_new %>%
pr2_new select(genbank_accession)
4.3 Compare with sequences in PR2
Sequences in target group in PR2: 42
Sequences in target group in PR2 that need update: 41
Updated sequences that are not active in PR2: 4
Sequences duplicated (e.g. with and without introns): 0
filter(pr2_18S, !!as.symbol(target_level) %in% target_group) %>%
::datatable(caption = "Sequences of target group in pr2 ")
DT
filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession)) %>%
::datatable(caption = "Sequences of PR2 that need update")
DT
filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession)) %>%
::datatable(caption = "Updated sequences that are not active in PR2")
DT
filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))%>%
::datatable(caption = "Updated sequences that are not present in PR2 ")
DT
filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) &
!(genbank_accession %in% pr2_update$genbank_accession)) %>%
::datatable(caption = "Sequences from target group in PR2 that are not in update")
DT
left_join(select(pr2_update, genbank_accession),
select(pr2_18S, genbank_accession, pr2_accession)) %>%
count(genbank_accession) %>%
filter(n > 1) %>%
::datatable(caption = "Sequences updated with 2 entries in PR2 (e.g. with and without introns) ") DT
5 Taxonomy
5.1 Build and check
<- pr2_update %>%
pr2_taxo_updated group_by_at(pr2.env$taxo_levels[[8]]) %>%
count()
pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[8]], full_path("taxo"))
<- pr2_taxo_updated %>%
pr2_taxo_updated rename_all(~ str_c(.,"_new" )) %>%
::rename(species = species_new) %>%
dplyrleft_join(pr2_taxo) %>%
rename_at(pr2.env$taxo_levels[[8]], ~ str_c(.,"_old" )) %>%
rename_all( ~ str_replace(.,"_new", "_8" )) %>%
::rename(species_8 = species_old) %>%
dplyrmutate(species_9 = species_8,
genus_9 = genus_8,
family_9 = family_8,
taxo_edited_version = pr2.env$version,
taxo_edited_by = "A.Schoenle")
export(pr2_taxo_updated, full_path("taxo_updated.xlsx"))
5.2 Find taxa in PR2 that are not included in the update
<- pr2_taxo %>%
pr2_taxo_not_updated filter(!!as.symbol(target_level) %in% target_group) %>%
filter(!(species %in% pr2_taxo_updated$species_8))
export(pr2_taxo_not_updated , full_path("taxo_not_updated.xlsx"))
6 Finalization
6.1 Sequences that need updating
<- pr2_update %>%
pr2_update_final select(genbank_accession, species, start, end) %>%
::rename(species_new = species,
dplyrstart_new = start,
end_new = end) %>%
left_join(select(pr2_main, pr2_accession, genbank_accession, species, start, end, removed_version, edited_remark))
6.2 Sequences with different start or end
The actual sequence would need updating the sequences in table pr2_sequences
- No sequences found
<- pr2_update_final %>%
pr2_sequence_updated filter((start != start_new)|(end != end_new))
%>%
pr2_sequence_updated ::datatable(caption = "Actual sequence and pr2_accession needs to be updated ") DT
6.3 Sequences without species name or with different species
- Sequences adde: 31
- Sequences updated: 30
<- pr2_update_final %>%
pr2_main_updated filter((species != species_new)|is.na(species))
%>%
pr2_main_updated ::datatable(caption = "Actual sequence and pr2_accession needs to be updated ")
DT
::glue("Number of updated sequences {nrow(filter(pr2_main_updated, !is.na(species)))}")
glue::glue("Number of new sequences {nrow(filter(pr2_main_updated, is.na(species)))}") glue
6.3.1 Add fields
<- pr2_main_updated %>%
pr2_main_updated select (pr2_accession,
species_8 = species_new,
species_9 = species_new,
removed_version,%>%
edited_remark) mutate(edited_version = pr2.env$version,
edited_by = pr2.env$editor)
7 Save everything to an Excel file
<- full_path("pr2_imports_final.xlsx")
file_pr2_imports <- list("pr2_main_updated" = pr2_main_updated
onglets # "pr2_sequences_updated" = pr2_sequence_updated
)export(onglets, file_pr2_imports)