PR2 version 4.14.0
Foram from Raphael Morard
1 Reference
2 Init
source('../PR2_init.R', echo=FALSE)
source('../PR2_read_google.R', echo=FALSE)
<- pr2 %>%
pr2_18S filter(gene == "18S_rRNA")
<- pr2_18S %>%
pr2_18S_removed filter(!is.na(removed_version))
3 Set up the files
= c("Foraminifera")
target_group = "division"
target_level
<- "Foram Morard"
dir_pr2_update
$editor <- "R. Morard"
pr2.env
<- function(file_name){str_c(dir_pr2_update,"/", file_name)}
full_path
<- full_path("Foram PR2 2019-07-29_Update_RM.xlsx")
file_pr2_update_excel
# create the directory for taxonomy output
dir.create(full_path("taxo"))
4 Read the original data and reformat
4.1 Read the data
- Number of sequences = 3839
<- import(file_pr2_update_excel, sheet = "pr2", guess_max=200000, na=c("", "-"))
pr2_update_all
<- pr2_update_all %>%
pr2_update filter(REVISION != "TO BE DONE")
str_c("Number of sequences : ", nrow(pr2_update))
4.2 Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.R
on serverRun second part
PR2-update-GenBank.R
Note: 1149 sequences were not yet in GenBank…. they have uploaded
<- filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))
pr2_missing ::datatable(pr2_missing, caption = "Updated sequences that are not present in PR2 ")
DT
# Use this data frame to download new sequences and then restart from beginning
<- pr2_missing %>%
accessions pull(genbank_accession)
saveRDS(accessions, full_path("accessions_new.rds"))
# export(pr2_missing, full_path("pr2_radiolaria_not_genbank.xlsx"))
4.3 Sequences not in Genbank are removed from update
Number of sequences in Genbank : 3839
<- pr2_update %>%
pr2_update filter(!(genbank_accession) %in% pr2_missing$genbank_accession)
str_c("Number of sequences in Genbank : ", nrow(pr2_update))
4.4 Compare with sequences in PR2
Sequences in target group in PR2: 2556
Sequences in target group in PR2 that need update: 3839 - This includes sequences that are no information at the species level and recently uploaded sequences
Updated sequences that are not active in PR2: 40 (sequences removed from PR2 because of quality problem)
Sequences duplicated: 0
chimera sequences: 0
filter(pr2_18S, !!as.symbol(target_level) %in% target_group) %>%
::datatable(caption = "Sequences of target group in pr2 ")
DT
filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession)) %>%
::datatable(caption = "Sequences of PR2 that need update")
DT
filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession)) %>%
::datatable(caption = "Updated sequences that are not active in PR2")
DT
filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))%>%
::datatable(caption = "Updated sequences that are not present in PR2 ")
DT
left_join(select(pr2_update, genbank_accession),
select(pr2_18S, genbank_accession, pr2_accession)) %>%
count(genbank_accession) %>%
filter(n > 1) %>%
::datatable(caption = "Sequences updated with 2 entries in PR2 (e.g. with and without introns) ")
DT
filter(pr2_update, !is.na(chimera)) %>%
::datatable(caption = "Chimera") DT
5 Foram sequences in PR2 that are not in the Excel worksheet
- Sequences from target group that are not in the update: 790 (however most are in Raphael worksheet)
- Sequences from target group that are not in Excel: 15 (these sequences had been assigned based on hashtag…).
Verify that the assignation is still valid…. so everything is good….
<- filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) &
pr2_not_updated !(genbank_accession %in% pr2_update_all$genbank_accession) &
is.na(removed_version)) %>%
select(pr2_accession, species, sequence_hash) %>%
left_join(select(pr2_18S, pr2_accession_1 = pr2_accession, species_1 = species, sequence_hash), by = "sequence_hash") %>%
filter( species_1 != species)
%>%
pr2_not_updated ::datatable(caption = "Sequences from target group in PR2 that are not in update") DT
6 Taxonomy
6.1 Build and check the taxonomy for the all sequences even though only some are updated
Number of taxo checked or updated: 315
<- pr2_update_all %>%
pr2_taxo_updated filter(is.na(chimera)) %>%
group_by_at(pr2.env$taxo_levels[[8]][4:8]) %>%
count()
pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[8]][4:8], full_path("taxo"))
<- pr2_taxo_updated %>%
pr2_taxo_updated rename_all(~ str_c(.,"_new" )) %>%
::rename(species = species_new) %>%
dplyrleft_join(pr2_taxo) %>%
rename_at(pr2.env$taxo_levels[[8]], ~ str_c(.,"_old" )) %>%
rename_all( ~ str_replace(.,"_new", "_8" )) %>%
::rename(species_8 = species_old) %>%
dplyrmutate(kingdom_8 = "Eukaryota",
supergroup_8 = "Rhizaria",
division_8 = "Foraminifera",
domain_9 = kingdom_8,
supergroup_9 = "TSAR",
division_9 = supergroup_8,
subdivision_9 = division_8,
class_9 = class_8,
order_9 = order_8,
family_9 = family_8,
genus_9 = genus_8,
species_9 = species_8,
taxo_edited_version = pr2.env$version,
taxo_edited_by = pr2.env$editor)
export(pr2_taxo_updated, full_path("taxo_updated.xlsx"))
6.2 Find taxa in PR2 that are not included in the update - DO NOT APPLY
- These 7 entries were edited manually
<- pr2_taxo %>%
pr2_taxo_not_updated filter(!!as.symbol(target_level) %in% target_group) %>%
filter(!(species %in% pr2_taxo_updated$species_8))
export(pr2_taxo_not_updated , full_path("taxo_not_updated.xlsx"))
7 Finalization
7.1 Sequences that need updating
<- pr2_update %>%
pr2_update_final select(genbank_accession, species, eukeref_env_biome) %>%
::rename(species_new = species) %>%
dplyrleft_join(select(pr2_main, pr2_accession, genbank_accession,
removed_version, edited_version, chimera,
edited_by, edited_remark,
species),by = "genbank_accession")
7.2 pr2_main: Sequences without species name or with different species
- Number of updated sequences with prior taxonomy: 92
- Number of updated sequences without prior taxonomy 2072
<- pr2_update_final %>%
pr2_needs_update filter((species != species_new)|is.na(species))
%>%
pr2_needs_update ::datatable(caption = "Actual sequence and pr2_accession needs to be updated ")
DT
::glue("Number of updated sequences with prior taxonomy: {nrow(filter(pr2_needs_update, !is.na(species)))}")
glue::glue("Number of updated sequences without prior taxonomy {nrow(filter(pr2_needs_update, is.na(species)))}")
glue
<- pr2_update_final %>%
pr2_main_updated select (genbank_accession,
species_8 = species_new,
species_9 = species_new,
edited_version,
edited_by,
edited_remark,
removed_version,
chimera%>%
) mutate(edited_version = str_append(edited_version, pr2.env$version) ,
edited_by = str_append(edited_by, pr2.env$editor)
# chimera = case_when(!is.na(chimera) ~ 1),
# chimera_remark = case_when(!is.na(chimera) ~ "chimera detected by M. Sandin"),
# removed_version = case_when(!is.na(chimera) ~ pr2.env$version,
# TRUE ~ removed_version )
)
7.2.1 pr2_metaddata
<- pr2_update_final %>%
pr2_metadata_updated select(genbank_accession, eukeref_env_biome) %>%
filter(!is.na(eukeref_env_biome))
8 Save everything to an Excel file
<- full_path("pr2_update_tables.xlsx")
file_pr2_imports <- list("pr2_main_updated" = pr2_main_updated,
onglets "pr2_metadata_updated" = pr2_metadata_updated,
"pr2_taxo_updated" = pr2_taxo_updated
)export(onglets, file_pr2_imports)