PR2 version 4.14.0
Foram from Raphael Morard

1 Reference

2 Init

  source('../PR2_init.R', echo=FALSE)
  source('../PR2_read_google.R', echo=FALSE)

  pr2_18S <- pr2 %>% 
    filter(gene == "18S_rRNA")
  
  pr2_18S_removed <- pr2_18S %>% 
    filter(!is.na(removed_version))

3 Set up the files

  target_group =  c("Foraminifera")
  target_level = "division"

  dir_pr2_update <- "Foram Morard"
  
  pr2.env$editor <- "R. Morard"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

  file_pr2_update_excel <- full_path("Foram PR2 2019-07-29_Update_RM.xlsx")  

# create the directory for taxonomy output
  dir.create(full_path("taxo"))

4 Read the original data and reformat

4.1 Read the data

  • Number of sequences = 3839
pr2_update_all <- import(file_pr2_update_excel, sheet = "pr2", guess_max=200000, na=c("", "-"))

pr2_update <- pr2_update_all %>% 
  filter(REVISION != "TO BE DONE")

  str_c("Number of sequences : ", nrow(pr2_update))

4.2 Add to PR2 missing sequences from Genbank

  • Run the script script_genbank_xml.R on server

  • Run second part PR2-update-GenBank.R

Note: 1149 sequences were not yet in GenBank…. they have uploaded

  pr2_missing <-  filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession)) 
  DT::datatable(pr2_missing, caption = "Updated sequences that are not present in PR2 ")
  
  # Use this data frame to download new sequences and then restart from beginning
  accessions <- pr2_missing %>% 
    pull(genbank_accession)
  
  saveRDS(accessions, full_path("accessions_new.rds"))
  
  # export(pr2_missing, full_path("pr2_radiolaria_not_genbank.xlsx"))

4.3 Sequences not in Genbank are removed from update

Number of sequences in Genbank : 3839

  pr2_update <- pr2_update %>% 
    filter(!(genbank_accession) %in% pr2_missing$genbank_accession) 

  str_c("Number of sequences in Genbank : ", nrow(pr2_update))

4.4 Compare with sequences in PR2

  • Sequences in target group in PR2: 2556

  • Sequences in target group in PR2 that need update: 3839 - This includes sequences that are no information at the species level and recently uploaded sequences

  • Updated sequences that are not active in PR2: 40 (sequences removed from PR2 because of quality problem)

  • Sequences duplicated: 0

  • chimera sequences: 0

  filter(pr2_18S, !!as.symbol(target_level) %in% target_group) %>% 
     DT::datatable(caption = "Sequences of target group in pr2 ") 

  filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession)) %>% 
    DT::datatable(caption = "Sequences of PR2 that need update")
  
  filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession)) %>% 
     DT::datatable(caption = "Updated sequences that are not active in PR2")
  
  filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))%>% 
     DT::datatable(caption = "Updated sequences that are not present in PR2 ")
  
  left_join(select(pr2_update, genbank_accession), 
            select(pr2_18S, genbank_accession, pr2_accession)) %>% 
   count(genbank_accession) %>% 
   filter(n > 1) %>% 
   DT::datatable(caption = "Sequences updated with 2 entries in PR2  (e.g. with and without introns) ")
  
   filter(pr2_update, !is.na(chimera)) %>% 
     DT::datatable(caption = "Chimera")

5 Foram sequences in PR2 that are not in the Excel worksheet

  • Sequences from target group that are not in the update: 790 (however most are in Raphael worksheet)
  • Sequences from target group that are not in Excel: 15 (these sequences had been assigned based on hashtag…).

Verify that the assignation is still valid…. so everything is good….

  pr2_not_updated <- filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) & 
             !(genbank_accession %in% pr2_update_all$genbank_accession) &
           is.na(removed_version)) %>% 
    select(pr2_accession, species, sequence_hash) %>% 
    left_join(select(pr2_18S, pr2_accession_1 = pr2_accession, species_1 = species, sequence_hash), by = "sequence_hash") %>% 
    filter( species_1 != species)
    
  pr2_not_updated %>% 
  DT::datatable(caption = "Sequences from target group in PR2 that are not in update")

6 Taxonomy

6.1 Build and check the taxonomy for the all sequences even though only some are updated

Number of taxo checked or updated: 315

pr2_taxo_updated <- pr2_update_all %>% 
  filter(is.na(chimera)) %>% 
  group_by_at(pr2.env$taxo_levels[[8]][4:8]) %>%  
  count() 

pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[8]][4:8], full_path("taxo"))

pr2_taxo_updated <- pr2_taxo_updated %>% 
  rename_all(~ str_c(.,"_new" )) %>%
  dplyr::rename(species = species_new) %>% 
  left_join(pr2_taxo) %>% 
  rename_at(pr2.env$taxo_levels[[8]], ~ str_c(.,"_old" )) %>% 
  rename_all( ~ str_replace(.,"_new", "_8" )) %>% 
  dplyr::rename(species_8 = species_old) %>% 
  mutate(kingdom_8 = "Eukaryota",
         supergroup_8 = "Rhizaria",
         division_8 = "Foraminifera",
         domain_9 = kingdom_8,
         supergroup_9 = "TSAR",
         division_9 = supergroup_8,
         subdivision_9 =  division_8,
         class_9 = class_8,
         order_9 = order_8,
         family_9 = family_8, 
         genus_9 = genus_8,
         species_9 = species_8,
         taxo_edited_version = pr2.env$version,
         taxo_edited_by = pr2.env$editor)  
  
export(pr2_taxo_updated, full_path("taxo_updated.xlsx"))

6.2 Find taxa in PR2 that are not included in the update - DO NOT APPLY

  • These 7 entries were edited manually
pr2_taxo_not_updated <- pr2_taxo %>% 
  filter(!!as.symbol(target_level) %in% target_group) %>% 
  filter(!(species %in% pr2_taxo_updated$species_8))

export(pr2_taxo_not_updated , full_path("taxo_not_updated.xlsx"))

7 Finalization

7.1 Sequences that need updating

pr2_update_final <- pr2_update %>% 
  select(genbank_accession, species, eukeref_env_biome) %>% 
  dplyr::rename(species_new = species) %>% 
  left_join(select(pr2_main, pr2_accession, genbank_accession, 
                   removed_version, edited_version, chimera,
                   edited_by, edited_remark,
                   species),
            by = "genbank_accession")

7.2 pr2_main: Sequences without species name or with different species

  • Number of updated sequences with prior taxonomy: 92
  • Number of updated sequences without prior taxonomy 2072
pr2_needs_update <- pr2_update_final %>%  
  filter((species != species_new)|is.na(species)) 

pr2_needs_update %>% 
  DT::datatable(caption = "Actual sequence and pr2_accession needs to be updated  ")

glue::glue("Number of updated sequences  with prior taxonomy: {nrow(filter(pr2_needs_update, !is.na(species)))}")
glue::glue("Number of updated sequences  without prior taxonomy {nrow(filter(pr2_needs_update, is.na(species)))}")

pr2_main_updated <- pr2_update_final %>%  
    select (genbank_accession, 
            species_8 = species_new, 
            species_9 = species_new,
            edited_version,
            edited_by,
            edited_remark,
            removed_version,
            chimera
            ) %>% 
    mutate(edited_version = str_append(edited_version, pr2.env$version) , 
           edited_by = str_append(edited_by, pr2.env$editor)
           # chimera = case_when(!is.na(chimera) ~ 1),
           # chimera_remark = case_when(!is.na(chimera) ~ "chimera detected by M. Sandin"),
           # removed_version = case_when(!is.na(chimera) ~ pr2.env$version,
                                       # TRUE ~ removed_version )
           ) 

7.2.1 pr2_metaddata

pr2_metadata_updated <- pr2_update_final %>%
  select(genbank_accession, eukeref_env_biome) %>% 
  filter(!is.na(eukeref_env_biome))

8 Save everything to an Excel file

 file_pr2_imports <-  full_path("pr2_update_tables.xlsx")
 onglets <- list("pr2_main_updated" = pr2_main_updated,
                 "pr2_metadata_updated" = pr2_metadata_updated,
                 "pr2_taxo_updated" = pr2_taxo_updated
                 )
 export(onglets, file_pr2_imports)