PR2 version 4.14.0
Foram from Raphael Morard
1 Reference
2 Init
source('../PR2_init.R', echo=FALSE)
source('../PR2_read_google.R', echo=FALSE)
pr2_18S <- pr2 %>%
filter(gene == "18S_rRNA")
pr2_18S_removed <- pr2_18S %>%
filter(!is.na(removed_version))3 Set up the files
target_group = c("Foraminifera")
target_level = "division"
dir_pr2_update <- "Foram Morard"
pr2.env$editor <- "R. Morard"
full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}
file_pr2_update_excel <- full_path("Foram PR2 2019-07-29_Update_RM.xlsx")
# create the directory for taxonomy output
dir.create(full_path("taxo"))4 Read the original data and reformat
4.1 Read the data
- Number of sequences = 3839
pr2_update_all <- import(file_pr2_update_excel, sheet = "pr2", guess_max=200000, na=c("", "-"))
pr2_update <- pr2_update_all %>%
filter(REVISION != "TO BE DONE")
str_c("Number of sequences : ", nrow(pr2_update))4.2 Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.Ron serverRun second part
PR2-update-GenBank.R
Note: 1149 sequences were not yet in GenBank…. they have uploaded
pr2_missing <- filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))
DT::datatable(pr2_missing, caption = "Updated sequences that are not present in PR2 ")
# Use this data frame to download new sequences and then restart from beginning
accessions <- pr2_missing %>%
pull(genbank_accession)
saveRDS(accessions, full_path("accessions_new.rds"))
# export(pr2_missing, full_path("pr2_radiolaria_not_genbank.xlsx"))4.3 Sequences not in Genbank are removed from update
Number of sequences in Genbank : 3839
pr2_update <- pr2_update %>%
filter(!(genbank_accession) %in% pr2_missing$genbank_accession)
str_c("Number of sequences in Genbank : ", nrow(pr2_update))4.4 Compare with sequences in PR2
Sequences in target group in PR2: 2556
Sequences in target group in PR2 that need update: 3839 - This includes sequences that are no information at the species level and recently uploaded sequences
Updated sequences that are not active in PR2: 40 (sequences removed from PR2 because of quality problem)
Sequences duplicated: 0
chimera sequences: 0
filter(pr2_18S, !!as.symbol(target_level) %in% target_group) %>%
DT::datatable(caption = "Sequences of target group in pr2 ")
filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession)) %>%
DT::datatable(caption = "Sequences of PR2 that need update")
filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession)) %>%
DT::datatable(caption = "Updated sequences that are not active in PR2")
filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))%>%
DT::datatable(caption = "Updated sequences that are not present in PR2 ")
left_join(select(pr2_update, genbank_accession),
select(pr2_18S, genbank_accession, pr2_accession)) %>%
count(genbank_accession) %>%
filter(n > 1) %>%
DT::datatable(caption = "Sequences updated with 2 entries in PR2 (e.g. with and without introns) ")
filter(pr2_update, !is.na(chimera)) %>%
DT::datatable(caption = "Chimera")5 Foram sequences in PR2 that are not in the Excel worksheet
- Sequences from target group that are not in the update: 790 (however most are in Raphael worksheet)
- Sequences from target group that are not in Excel: 15 (these sequences had been assigned based on hashtag…).
Verify that the assignation is still valid…. so everything is good….
pr2_not_updated <- filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) &
!(genbank_accession %in% pr2_update_all$genbank_accession) &
is.na(removed_version)) %>%
select(pr2_accession, species, sequence_hash) %>%
left_join(select(pr2_18S, pr2_accession_1 = pr2_accession, species_1 = species, sequence_hash), by = "sequence_hash") %>%
filter( species_1 != species)
pr2_not_updated %>%
DT::datatable(caption = "Sequences from target group in PR2 that are not in update")6 Taxonomy
6.1 Build and check the taxonomy for the all sequences even though only some are updated
Number of taxo checked or updated: 315
pr2_taxo_updated <- pr2_update_all %>%
filter(is.na(chimera)) %>%
group_by_at(pr2.env$taxo_levels[[8]][4:8]) %>%
count()
pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[8]][4:8], full_path("taxo"))
pr2_taxo_updated <- pr2_taxo_updated %>%
rename_all(~ str_c(.,"_new" )) %>%
dplyr::rename(species = species_new) %>%
left_join(pr2_taxo) %>%
rename_at(pr2.env$taxo_levels[[8]], ~ str_c(.,"_old" )) %>%
rename_all( ~ str_replace(.,"_new", "_8" )) %>%
dplyr::rename(species_8 = species_old) %>%
mutate(kingdom_8 = "Eukaryota",
supergroup_8 = "Rhizaria",
division_8 = "Foraminifera",
domain_9 = kingdom_8,
supergroup_9 = "TSAR",
division_9 = supergroup_8,
subdivision_9 = division_8,
class_9 = class_8,
order_9 = order_8,
family_9 = family_8,
genus_9 = genus_8,
species_9 = species_8,
taxo_edited_version = pr2.env$version,
taxo_edited_by = pr2.env$editor)
export(pr2_taxo_updated, full_path("taxo_updated.xlsx"))6.2 Find taxa in PR2 that are not included in the update - DO NOT APPLY
- These 7 entries were edited manually
pr2_taxo_not_updated <- pr2_taxo %>%
filter(!!as.symbol(target_level) %in% target_group) %>%
filter(!(species %in% pr2_taxo_updated$species_8))
export(pr2_taxo_not_updated , full_path("taxo_not_updated.xlsx"))7 Finalization
7.1 Sequences that need updating
pr2_update_final <- pr2_update %>%
select(genbank_accession, species, eukeref_env_biome) %>%
dplyr::rename(species_new = species) %>%
left_join(select(pr2_main, pr2_accession, genbank_accession,
removed_version, edited_version, chimera,
edited_by, edited_remark,
species),
by = "genbank_accession")7.2 pr2_main: Sequences without species name or with different species
- Number of updated sequences with prior taxonomy: 92
- Number of updated sequences without prior taxonomy 2072
pr2_needs_update <- pr2_update_final %>%
filter((species != species_new)|is.na(species))
pr2_needs_update %>%
DT::datatable(caption = "Actual sequence and pr2_accession needs to be updated ")
glue::glue("Number of updated sequences with prior taxonomy: {nrow(filter(pr2_needs_update, !is.na(species)))}")
glue::glue("Number of updated sequences without prior taxonomy {nrow(filter(pr2_needs_update, is.na(species)))}")
pr2_main_updated <- pr2_update_final %>%
select (genbank_accession,
species_8 = species_new,
species_9 = species_new,
edited_version,
edited_by,
edited_remark,
removed_version,
chimera
) %>%
mutate(edited_version = str_append(edited_version, pr2.env$version) ,
edited_by = str_append(edited_by, pr2.env$editor)
# chimera = case_when(!is.na(chimera) ~ 1),
# chimera_remark = case_when(!is.na(chimera) ~ "chimera detected by M. Sandin"),
# removed_version = case_when(!is.na(chimera) ~ pr2.env$version,
# TRUE ~ removed_version )
) 7.2.1 pr2_metaddata
pr2_metadata_updated <- pr2_update_final %>%
select(genbank_accession, eukeref_env_biome) %>%
filter(!is.na(eukeref_env_biome))8 Save everything to an Excel file
file_pr2_imports <- full_path("pr2_update_tables.xlsx")
onglets <- list("pr2_main_updated" = pr2_main_updated,
"pr2_metadata_updated" = pr2_metadata_updated,
"pr2_taxo_updated" = pr2_taxo_updated
)
export(onglets, file_pr2_imports)