PR2 version 4.11.0
Chlorophyta - Chloropicophyceae
PR2 version 4.11.0
Chlorophyta - Chloropicophyceae
- 1 Main changes
- 2 References used
- 3 Read pr2
- 4 Prasinophytes clade VII
- 5 Construct taxonomy and compare to existing taxonomy
- 6 Old sequences
- 7 New sequences
- 7.1 Extract lines with new accession number
- 7.2 Download the sequences and parse them
- 7.3 Extract features
- 7.4 Update sequence_start and sequence_end from feature table
- 7.5 Merge the accession, taxonomy, sequence and metadata
- 7.6 Extract sequences and get length and number of ambiguities
- 7.7 Create PR2 fields
- 7.8 Create the different tables
- 7.9 Write to database
- 8 Reference sequences
By default do not run any of the chunks when knitting
knitr::opts_chunk$set(eval=FALSE)
Load the variables common to the different scripts and the necessary libraries
source('PR2_init.R', echo=FALSE)
1 Main changes
Version 4.11.0
- Update prasinophytes clade VII taxonomy
2 References used
- Lopes dos Santos, A., Pollina, T., Gourvil, P., Corre, E., Marie, D., Garrido, J.L., Rodríguez, F. et al. 2017. Chloropicophyceae, a new class of picophytoplanktonic prasinophytes. Sci. Rep. 7:14019.
3 Read pr2
Note : Now use the local pr2 database to avoid connection problems
# Info for the local dabase
pr2_db <- db_info("pr2_local")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))
pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
pr2 <- left_join (pr2, pr2_seq)
pr2 <- left_join (pr2, pr2_metadata)
db_disconnect(pr2_db_con)
pr2 <- pr2 %>% filter (is.na(removed_version))
4 Prasinophytes clade VII
4.1 Define annotator
pr2.env$editor = "Lopes dos Santos A."
4.2 Export clade VII
pr2_clade7 <- pr2 %>%
filter(class=="Prasino-Clade-VII") %>%
select(pr2_accession, genbank_accession, start, end, kingdom:genus, species, sequence) %>%
arrange(species)
write_tsv(pr2_clade7, "../updates/2018 Prasino Clade VII/pr2_clade7.tsv", na="")
4.3 Define the file names
file_pr2_update <- "../updates/2018 Prasino Clade VII/PR2 prasinophytes clade VII 1.3.xlsx"
dir_pr2_update <- dirname(file_pr2_update)
file_pr2_update_taxo <- paste0(dir_pr2_update, "/taxo.txt")
file_pr2_update_new_features <- paste0(dir_pr2_update, "/features_new_sequences.txt")
file_pr2_not_updated <- paste0(dir_pr2_update, "/pr2_sequences_not_updated.txt")
file_pr2_update_new <- paste0(dir_pr2_update, "/pr2_update_new.txt")
4.4 Read the updated Excel file - Need to put guess_max at the max
pr2_update <- read_excel(file_pr2_update, sheet="pr2_update", col_names = TRUE, guess_max=200000)
5 Construct taxonomy and compare to existing taxonomy
5.1 Construct a taxonomy file to import into Excel (taxo.txt file)
- Check manually in the database what needs to be updated
# Construct the taxonomy file
pr2_update_taxo <- pr2_update %>%
group_by_(.dots=pr2.env$taxo_levels) %>%
summarise(sequence_number = n()) %>%
ungroup()
# Check it for duplicate names and duplicate parents
pr2_taxo_check (select(pr2_update_taxo, kingdom:species), dir_pr2_update)
# Compare to existing taxonomy
pr2_update_taxo_compared <- left_join (pr2_update_taxo, pr2_taxo, by = c("species"="species")) %>%
rename_ ("kingdom"="kingdom.x", "supergroup"="supergroup.x", "division" = "division.x",
"class" = "class.x", "order" = "order.x", "family"="family.x", "genus" = "genus.x") %>%
arrange(species)
# Write to a file to examine with Excel
write.table(pr2_update_taxo_compared,file=file_pr2_update_taxo, sep="\t", quote=FALSE, row.names=FALSE, na = "")
5.2 Update the entries that are already in the taxo table
- 1 species
- These entries have stuff in the taxo.y columns
- Note that the version and the editors have to be changed in the dv_function_pr2 file
- Note that the function update other species belonging to the same genera so that they have the same upstream taxonomy
pr2_taxo_update <- pr2_update_taxo_compared %>% filter(!(is.na(kingdom.y))) %>%
select(kingdom:species)
pr2_taxo_update <- pr2_taxo_update(pr2_taxo_update, method = "update")
# Check which lines did not work
filter( pr2_taxo_update, query_result==0)
5.3 Append the new entries
- 13 species
- All clade VII entries removed manually
- Other species from the same genera are also updated
pr2_taxo_add <- pr2_update_taxo_compared %>% filter(is.na(kingdom.y)) %>%
select(kingdom:species)
pr2_taxo_add <- pr2_taxo_update(pr2_taxo_add, method = "add")
# Check which lines did not work
filter(pr2_taxo_add, query_result==0)
5.4 Recheck the whole taxonomy table
pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter (is.na(taxo_removed_version))
pr2_taxo_check (select(pr2_taxo, kingdom:species), dir_pr2_update)
6 Old sequences
- 198 sequences updated
# Extract lines with accession number already in PR2, then one need just to update the species, nothing more
pr2_update_old <- pr2_update %>% filter(genbank_accession %in% pr2$genbank_accession)
pr2_update_old <- pr2_sequence_reassign(pr2_update_old)
7 New sequences
7.1 Extract lines with new accession number
- 104 sequences
pr2_update_new <- pr2_update %>% filter(!(genbank_accession %in% pr2$genbank_accession))
write_tsv(pr2_update_new, str_c(dir_pr2_update, "/pr2_update_new.txt"), na="")
7.2 Download the sequences and parse them
pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
write_tsv(pr2_update_new_metadata, str_c(dir_pr2_update,"/pr2_update_new_metadata.txt"), na="")
knitr::kable(head(pr2_update_new_metadata, 3))
7.3 Extract features
Features that can be used to determine the start and end of the 18S - This is stored in a file.
pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
write_tsv(pr2_update_new_features,file_pr2_update_new_features, na = "")
7.4 Update sequence_start and sequence_end from feature table
Enter the start and end of the sequence into the pr2_update_new Excel sheet and reload Note : Not done for the Chloropicophyceae because no sequence with ITS region
# pr2_update_new <- read_excel(file_pr2_update, sheet="pr2_update_new", col_names = TRUE, guess_max=200000)
7.5 Merge the accession, taxonomy, sequence and metadata
pr2_update_new <-left_join(select(pr2_update_new, genbank_accession, start, end, species), pr2_update_new_metadata)
7.6 Extract sequences and get length and number of ambiguities
# Compute the start and end of sequence if not uploaded from the Excel file
pr2_update_new <- pr2_update_new %>%
mutate(start = case_when(is.na(start) ~ 1 ,
TRUE ~ start),
end = case_when(is.na(end) ~ sequence_length ,
TRUE ~ as.integer(end))) %>%
mutate (sequence = str_sub(sequence, start, end),
sequence_length_old = sequence_length)
# Update sequence length and ambiguities for all sequences since some may have been shortened
pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence)
pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence,pattern="[NRYSWKMBDHV]")
# Remove sequences that are shorter than minimum lenght (500) and those that have too many ambiguities
pr2_update_new_rejected <- pr2_update_new %>%
filter((sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max))
write_tsv( pr2_update_new_rejected, str_c(dir_pr2_update,"/pr2_update_new_rejected.txt"), na = "")
knitr::kable(pr2_update_new_rejected$genbank_accession)
pr2_update_new <- pr2_update_new %>% filter((sequence_length >= pr2.env$sequence_length_min) &
(ambiguities <= pr2.env$ambiguities_max))
7.7 Create PR2 fields
# pr2_update_new <- pr2_update_new %>%
# dplyr::rename(start=sequence_start, end=sequence_end)
pr2_update_new <- pr2_update_new %>%
mutate (label = "U",
added_version = pr2.env$version,
edited_by = pr2.env$editor,
pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = 0)
# Unquote the following three lines if Chimera or better write an "if statement" if species = Chimera_XXXX
# pr2_update_new$removed_version <- pr2_update_new$added_version
# pr2_update_new$chimera <- 1
# pr2_update_new$chimera_remark <- "Detected by B. Edvardsen by UCHIME or long branch/manual"
write_tsv(pr2_update_new,str_c(dir_pr2_update, "/pr2_update_new.txt"))
7.8 Create the different tables
pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession, start,end, label, species, reference_sequence,
starts_with("added_"),starts_with("removed_"), starts_with("chimera"))
pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length, ambiguities)
pr2_metadata_update <- select(pr2_update_new,genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_"))
pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)
7.9 Write to database
Either the update can be done : * directly (but depends on Internet connection) - write_db * by exporting text file and then reimporting using the text file directly . The advantage of this method is that the data can be edited.
write_tsv(pr2_main_update,str_c(dir_pr2_update, "/pr2_main_update.txt"), na="")
write_tsv(pr2_sequences_update,str_c(dir_pr2_update, "/pr2_sequences_update.txt"), na="")
write_tsv(pr2_metadata_update,str_c(dir_pr2_update, "/pr2_metadata_update.txt"), na="")
db_append_records (pr2_db, table_name = "pr2_main", table_df = pr2_main_update )
db_append_records (pr2_db, table_name = "pr2_sequences", table_df = pr2_sequences_update )
db_append_records (pr2_db, table_name = "pr2_metadata", table_df = pr2_metadata_update )
8 Reference sequences
8.1 Find the references sequences
- 16 sequences
pr2_update_reference <- pr2_update %>% filter(!is.na(reference_sequence))
8.2 Build query to upgrade the fields
pr2_update_reference <- pr2_update_reference %>%
mutate(query = str_c("UPDATE pr2_main",
" SET reference_sequence = ", db_sql_escape(reference_sequence),
" WHERE genbank_accession = ",db_sql_escape(genbank_accession)))
#Execute query
pr2_update_reference$query_result <- db_execute_query_vector(pr2_db,pr2_update_reference$query)