PR2 version 4.14.0
Radiolaria from Miguel Sandin

1 Reference

Pipeline PR2_rads_pipeline.pdf (link)

Comments * 67 sequences were not in GenBank at the time of the update. The import script will need to be rerun with the missing sequences.

2 Init

  # Run the shared PR2 set-up scripts without echoing their code.
  # NOTE(review): these presumably define `pr2` and `pr2.env` used below - confirm.
  source("../PR2_init.R", echo = FALSE)
  source("../PR2_read_google.R", echo = FALSE)

  # Keep only the 18S rRNA entries of PR2
  pr2_18S <- filter(pr2, gene == "18S_rRNA")

  # 18S entries that carry a removal version
  pr2_18S_removed <- filter(pr2_18S, !is.na(removed_version))

3 Set up the files

  # Taxonomic group handled by this update and the level at which it is named
  target_group <- "Radiolaria"
  target_level <- "division"

  # Directory holding all input and output files of this update
  dir_pr2_update <- "Radiolaria Miguel"

  # Editor name recorded with the edited entries
  pr2.env$editor <- "M. Sandin"

  # Prefix a file name with the update directory
  full_path <- function(file_name) {
    str_c(dir_pr2_update, file_name, sep = "/")
  }

  file_pr2_update_excel <- full_path("pr2_rads_updated.xlsx")

  # Create the directory for taxonomy output
  dir.create(full_path("taxo"))

4 Read the original data and reformat

4.1 Read the data

Number of sequences = 4911

# Read the "sequences" sheet of the update workbook; empty cells and "-" are
# treated as missing values.
pr2_update <- import(
  file_pr2_update_excel,
  sheet = "sequences",
  guess_max = 200000,
  na = c("", "-")
)

  # Report how many sequences were read
  str_c("Number of sequences : ", nrow(pr2_update))

4.2 Add to PR2 missing sequences from Genbank

Run the script script_genbank_xml.R on server
Run second part PR2-update-GenBank.R

Note: 67 sequences are not yet in GenBank….

  # Updated sequences whose accession is absent from the 18S part of PR2
  pr2_missing <- pr2_update %>%
    filter(!(genbank_accession %in% pr2_18S$genbank_accession))

  pr2_missing %>%
    DT::datatable(caption = "Updated sequences that are not present in PR2 ")

  # Accessions to download as new sequences; once they are imported,
  # restart this notebook from the beginning
  accessions <- pull(pr2_missing, genbank_accession)

  saveRDS(accessions, full_path("accessions_new.rds"))

  export(pr2_missing, full_path("pr2_radiolaria_not_genbank.xlsx"))

4.3 Sequences not in Genbank are removed from update

Number of sequences in Genbank : 4845

  # Drop from the update every sequence listed in pr2_missing.
  # The negation wraps the whole %in% test (this is also how R parses
  # `!(x) %in% y`, because %in% binds tighter than `!`).
  pr2_update <- filter(
    pr2_update,
    !(genbank_accession %in% pr2_missing$genbank_accession)
  )

  str_c("Number of sequences in Genbank : ", nrow(pr2_update))

4.4 Compare with sequences in PR2

Sequences in target group in PR2: 4633
Sequences in target group in PR2 that need update: 4844
Updated sequences that are not active in PR2: 32 (sequences removed from PR2)
Sequences from target group that are not in the update: 9 (these sequences have been removed from PR2 because of too many ambiguities)
Sequences duplicated: 0
chimera sequences: 343

  # 1. PR2 sequences that belong to the target group
  pr2_18S %>%
    filter(!!as.symbol(target_level) %in% target_group) %>%
    DT::datatable(caption = "Sequences of target group in pr2 ")

  # 2. PR2 sequences that appear in the update
  pr2_18S %>%
    filter(genbank_accession %in% pr2_update$genbank_accession) %>%
    DT::datatable(caption = "Sequences of PR2 that need update")

  # 3. Updated sequences that carry a removal version in PR2
  pr2_update %>%
    filter(genbank_accession %in% pr2_18S_removed$genbank_accession) %>%
    DT::datatable(caption = "Updated sequences that are not active in PR2")

  # 4. Updated sequences absent from PR2
  pr2_update %>%
    filter(!(genbank_accession %in% pr2_18S$genbank_accession)) %>%
    DT::datatable(caption = "Updated sequences that are not present in PR2 ")

  # 5. Target-group sequences of PR2 that the update does not cover
  pr2_18S %>%
    filter(
      (!!as.symbol(target_level) %in% target_group) &
        !(genbank_accession %in% pr2_update$genbank_accession)
    ) %>%
    DT::datatable(caption = "Sequences from target group in PR2 that are not in update")

  # 6. Updated accessions matching more than one PR2 entry
  pr2_update %>%
    select(genbank_accession) %>%
    left_join(select(pr2_18S, genbank_accession, pr2_accession)) %>%
    count(genbank_accession) %>%
    filter(n > 1) %>%
    DT::datatable(caption = "Sequences updated with 2 entries in PR2  (e.g. with and without introns) ")

  # 7. Updated sequences with a non-missing chimera flag
  pr2_update %>%
    filter(!is.na(chimera)) %>%
    DT::datatable(caption = "Chimera")

5 Taxonomy

5.1 Build and check

The old taxonomy is removed and the new taxonomy replaces it…

# Tally the non-chimeric updated sequences per unique lineage, using levels
# 4 to 8 of the version-8 taxonomy (class down to species, judging from the
# class_8 ... species_8 columns referenced further down).
pr2_taxo_updated <- pr2_update %>% 
  filter(is.na(chimera)) %>% 
  group_by_at(pr2.env$taxo_levels[[8]][4:8]) %>%  
  count() 

# Check the new taxonomy.
# NOTE(review): pr2_taxo_check presumably writes its report to the "taxo"
# directory passed as third argument - confirm.
pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[8]][4:8], full_path("taxo"))

# Build the taxonomy table to import:
#  - suffix all updated columns with "_new", but keep `species` as join key
#  - join the existing PR2 taxonomy and suffix its version-8 levels with "_old"
#  - turn the "_new" columns into the "_8" columns, and use the (old == new)
#    species column as species_8
#  - fill the fixed upper ranks and derive the version-9 ranks from version 8
pr2_taxo_updated <- pr2_taxo_updated %>% 
  rename_all(~ str_c(.,"_new" )) %>%
  dplyr::rename(species = species_new) %>% 
  left_join(pr2_taxo) %>% 
  rename_at(pr2.env$taxo_levels[[8]], ~ str_c(.,"_old" )) %>% 
  rename_all( ~ str_replace(.,"_new", "_8" )) %>% 
  dplyr::rename(species_8 = species_old) %>% 
  mutate(kingdom_8 = "Eukaryota",
         supergroup_8 = "Rhizaria",
         division_8 = "Radiolaria",
         domain_9 = "Eukaryota",
         supergroup_9 = "TSAR",
         division_9 = "Rhizaria",
         subdivision_9 = "Radiolaria",
         class_9 = class_8,
         order_9 = order_8,
         family_9 = family_8, 
         genus_9 = genus_8,
         species_9 = species_8,
         taxo_edited_version = pr2.env$version,
         # Use the editor set for this update ("M. Sandin") rather than a
         # hand-typed name, which was previously misspelled "M. Sendin"
         taxo_edited_by = pr2.env$editor)  
  
export(pr2_taxo_updated, full_path("taxo/taxo_updated.xlsx"))

5.2 Find taxa in PR2 that are not included in the update - DO NOT APPLY

The old taxonomy is removed

# Target-group species present in the PR2 taxonomy but absent from the
# updated taxonomy
pr2_taxo_not_updated <- filter(
  pr2_taxo,
  !!as.symbol(target_level) %in% target_group,
  !(species %in% pr2_taxo_updated$species_8)
)

export(pr2_taxo_not_updated, full_path("taxo_not_updated.xlsx"))

6 Finalization

6.1 Sequences that need updating

# Columns of pr2_main to compare against the update
pr2_main_current <- select(
  pr2_main,
  pr2_accession, genbank_accession, species, start, end,
  removed_version, edited_remark
)

# Updated values (suffixed "_new") side by side with the current PR2 values
pr2_update_final <- pr2_update %>%
  select(
    genbank_accession,
    species_new = species,
    start_new = start,
    end_new = end,
    chimera,
    eukref_biotic_relationship
  ) %>%
  left_join(pr2_main_current)

6.2 Sequences with different start or end

9 sequences have been reloaded into PR2 using the Genbank script.
These sequences had been initially uploaded with Silva which had imperfect trimming of the end.

# Sequences whose start or end position differs between PR2 and the update
pr2_sequence_to_update <- filter(
  pr2_update_final,
  start != start_new | end != end_new
)

DT::datatable(
  pr2_sequence_to_update,
  caption = "Actual sequence and pr2_accession needs to be updated  "
)

6.3 pr2_main: Sequences without species name or with different species

Number of updated sequences with prior taxonomy: 4399
Number of updated sequences without prior taxonomy 220

# Keep the sequences whose PR2 species differs from the updated one, that
# have no species in PR2, or that are flagged as chimera in the update.
pr2_main_updated <- pr2_update_final %>% 
  filter((species != species_new) | is.na(species) | !is.na(chimera)) 

# Caption describes this table (it was previously copied from the
# start/end comparison table of the preceding section).
pr2_main_updated %>% 
  DT::datatable(caption = "Sequences without species name, with different species or flagged as chimera")

glue::glue("Number of updated sequences  with prior taxonomy: {nrow(filter(pr2_main_updated, !is.na(species)))}")
glue::glue("Number of updated sequences  without prior taxonomy {nrow(filter(pr2_main_updated, is.na(species)))}")

# Build the table of pr2_main fields to import.
pr2_main_updated <- pr2_main_updated %>%  
    select (genbank_accession, 
            species_8 = species_new, 
            species_9 = species_new, 
            removed_version,
            edited_remark,
            chimera) %>% 
    mutate(edited_version = pr2.env$version, 
           edited_by = pr2.env$editor,
           # Recode the chimera flag: 1 when flagged, NA otherwise
           # (case_when has no fallback branch)
           chimera = case_when(!is.na(chimera) ~ 1),
           # Evaluated after the recoding above, so this tests the recoded
           # column; the NA pattern is unchanged by the recoding
           chimera_remark = case_when(!is.na(chimera) ~ "chimera detected by M. Sandin"),
           # Chimeras are marked as removed in the current version; other
           # sequences keep their existing removed_version
           removed_version = case_when(!is.na(chimera) ~ pr2.env$version,
                                       TRUE ~ removed_version ))

6.3.1 pr2_metadata

# Biotic relationship per accession, for the sequences where one is provided
pr2_metadata_updated <- pr2_update_final %>%
  filter(!is.na(eukref_biotic_relationship)) %>%
  select(genbank_accession, eukref_biotic_relationship)

7 Save everything to an Excel file

 # Destination workbook for the update tables
 file_pr2_imports <- full_path("pr2_update_tables.xlsx")

 # One named list element per table ("onglets" = tabs)
 onglets <- list(
   pr2_main_updated = pr2_main_updated,
   pr2_metadata_updated = pr2_metadata_updated,
   pr2_taxo_updated = pr2_taxo_updated
 )

 export(onglets, file_pr2_imports)

PR2 version 4.14.0 Radiolaria from Miguel Sandin

PR2 version 4.14.0 Radiolaria from Miguel Sandin

1 Reference

2 Init

3 Set up the files

4 Read the original data and reformat

4.1 Read the data

4.2 Add to PR2 missing sequences from Genbank

4.3 Sequences not in Genbank are removed from update

4.4 Compare with sequences in PR2

5 Taxonomy

5.1 Build and check

5.2 Find taxa in PR2 that are not included in the update - DO NOT APPLY

6 Finalization

6.1 Sequences that need updating

6.2 Sequences with different start or end

6.3 pr2_main: Sequences without species name or with different species

6.3.1 pr2_metadata

7 Save everything to an Excel file

PR2 version 4.14.0
Radiolaria from Miguel Sandin

PR2 version 4.14.0
Radiolaria from Miguel Sandin