PR2 version 4.14.0
Cafeteriales from Alex Schoenle

1 Reference

  • Schoenle, A., Hohlfeld, M., Rosse, M., Filz, P., Wylezich, C., Nitsche, F., & Arndt, H. (2020). Global comparison of bicosoecid Cafeteria-like flagellates from the deep ocean and surface waters, with reorganization of the family Cafeteriaceae. European Journal of Protistology, 73, 125665. https://doi.org/10.1016/j.ejop.2019.125665

Comments

  • Several sequences have been deposited as Pseudobodo, but are actually Cafeteria species as well.
  • I know that Cavalier Smith has renamed several Pseudobodo species to a new species “Boroka karpovii” (Borokaceae). Within my stramenopile 18S analysis (Schoenle et al. 2020), I found the Pseudobodo sequences to be far away from the Cafeteriaceae, that is why I used the name Boroka karpovii for them. However, I do not know if it is more suitable to change only the family level instead of the name. I marked the sequences in green.
  • I blasted the sequence KX602057 again (I did not have the sequence in my 18S analysis of Stramenopiles) and it is not a Cafeteria species. But I was not sure to what I should change it. (marked red)
  • AY665996 is as well not a Cafeteria species, I have changed it to Bicosoecida sp., but I am not sure if that this is the best way. (marked red)

2 Init

  source('../PR2_init.R', echo=FALSE)
  source('../PR2_read_google.R', echo=FALSE)

  pr2_18S <- pr2 %>% 
    filter(gene == "18S_rRNA")
  
  pr2_18S_removed <- pr2_18S %>% 
    filter(!is.na(removed_version))

3 Set up the files

  target_group =  c("Cafeteriaceae")
  target_level = "family"

  dir_pr2_update <- "Cafeteria"
  
  pr2.env$editor <- "A. Schoenle"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

  file_pr2_update_excel <- full_path("pr2_Cafeteriaceae_2020_11_21_AS.xlsx")  

# create the directory for taxonomy output
  dir.create(full_path("taxo"))

4 Read the original data and reformat

4.1 Read the data

  • Number of sequences = 72
pr2_update <- import(file_pr2_update_excel, sheet = "sequences", guess_max=200000, na=c("", "-"))
  
  str_c("Number of sequences : ", nrow(pr2_update))

4.2 Add to PR2 missing sequences from Genbank

  • Run the script script_genbank_xml.R on server

  • Run second part PR2-update-GenBank.R

  pr2_new <-  filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession)) 
  DT::datatable(pr2_new, caption = "Updated sequences that are not present in PR2 ")
  
  # Use this data frame to download new sequences and then restart from beginning
  pr2_new <- pr2_new %>% 
    select(genbank_accession)

4.3 Compare with sequences in PR2

  • Sequences in target group in PR2: 42

  • Sequences in target group in PR2 that need update: 41

  • Updated sequences that are not active in PR2: 4

  • Sequences duplicated (e.g. with and without introns): 0

  filter(pr2_18S, !!as.symbol(target_level) %in% target_group) %>% 
     DT::datatable(caption = "Sequences of target group in pr2 ") 

  filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession)) %>% 
    DT::datatable(caption = "Sequences of PR2 that need update")
  
  filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession)) %>% 
     DT::datatable(caption = "Updated sequences that are not active in PR2")
    
  filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))%>% 
     DT::datatable(caption = "Updated sequences that are not present in PR2 ")
  
  filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) & 
             !(genbank_accession %in% pr2_update$genbank_accession)) %>% 
  DT::datatable(caption = "Sequences from target group in PR2 that are not in update")
  
  left_join(select(pr2_update, genbank_accession), 
            select(pr2_18S, genbank_accession, pr2_accession)) %>% 
   count(genbank_accession) %>% 
   filter(n > 1) %>% 
   DT::datatable(caption = "Sequences updated with 2 entries in PR2  (e.g. with and without introns) ")

5 Taxonomy

5.1 Build and check

pr2_taxo_updated <- pr2_update %>% 
  group_by_at(pr2.env$taxo_levels[[8]]) %>%  
  count() 

pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[8]], full_path("taxo"))

pr2_taxo_updated <- pr2_taxo_updated %>% 
  rename_all(~ str_c(.,"_new" )) %>%
  dplyr::rename(species = species_new) %>% 
  left_join(pr2_taxo) %>% 
  rename_at(pr2.env$taxo_levels[[8]], ~ str_c(.,"_old" )) %>% 
  rename_all( ~ str_replace(.,"_new", "_8" )) %>% 
  dplyr::rename(species_8 = species_old) %>% 
  mutate(species_9 = species_8, 
         genus_9 = genus_8,
         family_9 = family_8,
         taxo_edited_version = pr2.env$version,
         taxo_edited_by = "A.Schoenle")  
  

export(pr2_taxo_updated, full_path("taxo_updated.xlsx"))

5.2 Find taxa in PR2 that are not included in the update

pr2_taxo_not_updated <- pr2_taxo %>% 
  filter(!!as.symbol(target_level) %in% target_group) %>% 
  filter(!(species %in% pr2_taxo_updated$species_8))

export(pr2_taxo_not_updated , full_path("taxo_not_updated.xlsx"))

6 Finalization

6.1 Sequences that need updating

pr2_update_final <- pr2_update %>% 
  select(genbank_accession, species, start, end) %>% 
  dplyr::rename(species_new = species, 
                start_new = start,
                end_new = end) %>% 
  left_join(select(pr2_main, pr2_accession, genbank_accession, species, start, end, removed_version, edited_remark))

6.2 Sequences with different start or end

The actual sequence would need updating the sequences in table pr2_sequences

  • No sequences found
pr2_sequence_updated <- pr2_update_final %>% 
  filter((start != start_new)|(end != end_new)) 

pr2_sequence_updated %>% 
  DT::datatable(caption = "Actual sequence and pr2_accession needs to be updated  ")

6.3 Sequences without species name or with different species

  • Sequences adde: 31
  • Sequences updated: 30
pr2_main_updated <- pr2_update_final %>% 
  filter((species != species_new)|is.na(species)) 

pr2_main_updated %>% 
  DT::datatable(caption = "Actual sequence and pr2_accession needs to be updated  ")

glue::glue("Number of updated sequences {nrow(filter(pr2_main_updated, !is.na(species)))}")
glue::glue("Number of new sequences {nrow(filter(pr2_main_updated, is.na(species)))}")

6.3.1 Add fields

pr2_main_updated <- pr2_main_updated %>%  
    select (pr2_accession, 
            species_8 = species_new, 
            species_9 = species_new, 
            removed_version,
            edited_remark) %>% 
    mutate(edited_version = pr2.env$version, 
           edited_by = pr2.env$editor) 

7 Save everything to an Excel file

 file_pr2_imports <-  full_path("pr2_imports_final.xlsx")
 onglets <- list("pr2_main_updated" = pr2_main_updated
                 # "pr2_sequences_updated" = pr2_sequence_updated
                 )
 export(onglets, file_pr2_imports)