PR2 version 4.11.0
Chlorophyta - Mamiellophyceae
PR2 version 4.11.0
Chlorophyta - Mamiellophyceae
- 1 Main changes
- 2 References used
- 3 Read pr2
- 4 Mamiellophyceae
- 5 Construct taxonomy and compare to existing taxonomy
- 6 Old sequences
- 7 New sequences
- 7.1 Extract lines with new accession number
- 7.2 Download the new sequences and parse them
- 7.3 Extract features
- 7.4 Update sequence_start and sequence_end from feature table
- 7.5 Merge the accession, taxonomy, sequence and metadata
- 7.6 Extract sequences and get length and number of ambiguities
- 7.7 Create PR2 fields
- 7.8 Create the different tables
- 7.9 Write to database
- 8 Reference sequences
By default do not run any of the chunks when knitting
knitr::opts_chunk$set(eval=FALSE)
Load the variables common to the different scripts and the necessary libraries
source('PR2_init.R', echo=FALSE)
1 Main changes
Version 4.11.0
- Update Mamiellophyceae taxonomy
2 References used
- Tragin, M. & Vaulot, D. 2018. Novel diversity within marine Mamiellophyceae (Chlorophyta) unveiled by metabarcoding. bioRxiv.
3 Read pr2
Note : Now use the local pr2 database to avoid connection problems
# Info for the local dabase
pr2_db <- db_info("pr2_local")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))
pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
pr2 <- left_join (pr2, pr2_seq)
pr2 <- left_join (pr2, pr2_metadata)
db_disconnect(pr2_db_con)
pr2 <- pr2 %>% filter (is.na(removed_version))
4 Mamiellophyceae
4.1 Define annotator
pr2.env$editor = "Vaulot D."
4.2 Define the file names
dir_pr2_update <- "../updates/2018 Mamiello/"
add_dir <- function(file_name) str_c(dir_pr2_update, file_name)
file_pr2_update <- add_dir("PR2 Mamiello.xlsx")
file_pr2_update_taxo <- add_dir("taxo.txt")
file_pr2_update_new_features <- add_dir("pr2_new_sequences_features.txt")
file_pr2_not_updated <- add_dir("pr2_sequences_not_updated.txt")
file_pr2_update_new <- add_dir("pr2_update_new.txt")
4.3 Export Existing sequences
pr2_selected <- pr2 %>%
filter(genus %in% c("Micromonas", "Ostreococcus", "Mantoniella"))%>%
arrange(species)
pr2_selected_excel <- pr2_selected %>%
select(pr2_accession, genbank_accession, start, end, kingdom:genus, species,gb_strain, gb_clone)
write_tsv(pr2_selected_excel, str_c(dir_pr2_update, "/pr2_mamiello.txt"), na="")
pr2_export(pr2_selected, str_c(dir_pr2_update, "/pr2_mamiello.fasta.gz"), file_type = "fasta" , file_format = "fasta_taxo_short")
pr2_export(pr2_selected, str_c(dir_pr2_update, "/pr2_mamiello_dada2.fasta.gz"), file_type = "fasta" , file_format = "dada2")
4.4 Assign the sequences
Strategy
- Principle: Use the Wang assigner as implementerd in dada2
- Use as reference the alignement with sequences and barcodes from Margot OSD paper
- It is necessary to edit the sequences names to be compatible with dada2
>species_name
(do not forget ; at the end) - The pr2 sequences are aligned with the refernce sequence and the V4 region is extracted.
- Run the dada2 routine for assignement
- Check the sequences which have only a part of the V4 region, in general they cannot be assigned
- Any sequence for which the bootstrap value is below 60 should be checked manually
Note : the extraction of the V4 region could be done using primers (could be a problem for sequences starting after V4 region)
dada2_assign(add_dir("mantoniella_pr2_V4.fasta"), ref_file_name = add_dir("mantoniella_ref.fasta"),tax_levels = c("genus","species"))
dada2_assign(add_dir("ostreococcus_pr2_V4.fasta"), ref_file_name = add_dir("ostreococcus_ref.fasta"),tax_levels = c("species"))
dada2_assign(add_dir("micromonas_pr2_V4.fasta"), ref_file_name = add_dir("micromonas_ref.fasta"),tax_levels = c("species"))
4.5 Read the updated Excel file - Need to put guess_max at the max
pr2_update <- read_excel(file_pr2_update, sheet="pr2_update", col_names = TRUE, guess_max=200000)
# Only keep the first columns
pr2_update <- pr2_update %>% select(pr2_accession:genus)
5 Construct taxonomy and compare to existing taxonomy
5.1 Construct a taxonomy file to import into Excel (taxo.txt file)
- Check manually in the database what needs to be updated
# Construct the taxonomy file
pr2_update_taxo <- pr2_update %>%
group_by_(.dots=pr2.env$taxo_levels) %>%
summarise(sequence_number = n()) %>%
ungroup()
# Check it for duplicate names and duplicate parents
pr2_taxo_check (select(pr2_update_taxo, kingdom:species), dir_pr2_update)
# Compare to existing taxonomy
pr2_update_taxo_compared <- left_join (pr2_update_taxo, pr2_taxo, by = c("species"="species")) %>%
rename_ ("kingdom"="kingdom.x", "supergroup"="supergroup.x", "division" = "division.x",
"class" = "class.x", "order" = "order.x", "family"="family.x", "genus" = "genus.x") %>%
arrange(species)
# Write to a file to examine with Excel
write_tsv(pr2_update_taxo_compared, file_pr2_update_taxo, na = "")
5.2 Update the entries that are already in the taxo table
- 1 species
- These entries have stuff in the taxo.y columns
- Note that the version and the editors have to be changed in the dv_function_pr2 file
- Note that the function update other species belonging to the same genera so that they have the same upstream taxonomy
pr2_taxo_update <- pr2_update_taxo_compared %>% filter(!(is.na(kingdom.y))) %>%
select(kingdom:species)
pr2_taxo_update <- pr2_taxo_update(pr2_taxo_update, method = "update")
# Check which lines did not work
filter( pr2_taxo_update, query_result==0)
5.3 Append the new entries
- 14 species or clades
- All old entries removed manually
pr2_taxo_add <- pr2_update_taxo_compared %>% filter(is.na(kingdom.y)) %>%
select(kingdom:species)
pr2_taxo_add <- pr2_taxo_update(pr2_taxo_add, method = "add")
# Check which lines did not work (if works query_result = 1)
filter(pr2_taxo_add, query_result==0)
5.4 Recheck the whole taxonomy table
pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter (is.na(taxo_removed_version))
pr2_taxo_check (select(pr2_taxo, kingdom:species), dir_pr2_update)
6 Old sequences
- 872 sequences updated
# Extract lines with accession number already in PR2, then one need just to update the species, nothing more
pr2_update_old <- pr2_update %>% filter(genbank_accession %in% pr2$genbank_accession)
pr2_update_old <- pr2_sequence_reassign(pr2_update_old)
7 New sequences
7.1 Extract lines with new accession number
- 8 sequences
pr2_update_new <- pr2_update %>% filter(!(genbank_accession %in% pr2$genbank_accession))
write_tsv(pr2_update_new, str_c(dir_pr2_update, "/pr2_update_new.txt"), na="")
7.2 Download the new sequences and parse them
pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
write_tsv(pr2_update_new_metadata, str_c(dir_pr2_update,"/pr2_update_new_metadata.txt"), na="")
head(pr2_update_new_metadata, 10)
7.3 Extract features
Features that can be used to determine the start and end of the 18S - This is stored in a file.
pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
write_tsv(pr2_update_new_features,file_pr2_update_new_features, na = "")
7.4 Update sequence_start and sequence_end from feature table
Enter the start and end of the sequence into the pr2_update_new Excel sheet and reload Note : Not done because no sequence with ITS region
# pr2_update_new <- read_excel(file_pr2_update, sheet="pr2_update_new", col_names = TRUE, guess_max=200000)
7.5 Merge the accession, taxonomy, sequence and metadata
pr2_update_new <-left_join(select(pr2_update_new, genbank_accession, start, end, species), pr2_update_new_metadata)
# Remove sequences which are not yet in Genbank
pr2_update_new <- pr2_update_new %>% filter(!is.na(gb_definition))
7.6 Extract sequences and get length and number of ambiguities
3 sequences are too short - 5 sequences remaining
# Compute the start and end of sequence if not uploaded from the Excel file
pr2_update_new <- pr2_update_new %>%
mutate(start = case_when(is.na(start) ~ 1 ,
TRUE ~ start),
end = case_when(is.na(end) ~ sequence_length ,
TRUE ~ as.integer(end))) %>%
mutate (sequence = str_sub(sequence, start, end),
sequence_length_old = sequence_length)
# Update sequence length and ambiguities for all sequences since some may have been shortened
pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence)
pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence,pattern="[NRYSWKMBDHV]")
# Remove sequences that are shorter than minimum lenght (500) and those that have too many ambiguities
pr2_update_new_rejected <- pr2_update_new %>%
filter((sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max))
write_tsv( pr2_update_new_rejected, str_c(dir_pr2_update,"/pr2_update_new_rejected.txt"), na = "")
pr2_update_new_rejected$genbank_accession
pr2_update_new <- pr2_update_new %>% filter((sequence_length >= pr2.env$sequence_length_min) &
(ambiguities <= pr2.env$ambiguities_max))
7.7 Create PR2 fields
# pr2_update_new <- pr2_update_new %>%
# dplyr::rename(start=sequence_start, end=sequence_end)
pr2_update_new <- pr2_update_new %>%
mutate (label = "U",
added_version = pr2.env$version,
edited_by = pr2.env$editor,
pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label))
# Add reference_sequence = 1 if the sequence are reference sequences
# Unquote the following three lines if Chimera or better write an "if statement" if species = Chimera_XXXX
# pr2_update_new$removed_version <- pr2_update_new$added_version
# pr2_update_new$chimera <- 1
# pr2_update_new$chimera_remark <- "Detected by B. Edvardsen by UCHIME or long branch/manual"
write_tsv(pr2_update_new,str_c(dir_pr2_update, "/pr2_update_new.txt"))
7.8 Create the different tables
pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession, start,end, label, species,
starts_with("added_"),starts_with("removed_"), starts_with("chimera"))
pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length, ambiguities)
pr2_metadata_update <- select(pr2_update_new,genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_"))
pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)
7.9 Write to database
The update can be done either : * directly (but depends on Internet connection) - write_db * by exporting text file and then reimporting using the text file directly . The advantage of this method is that the data can be edited.
write_tsv(pr2_main_update,str_c(dir_pr2_update, "/pr2_main_update.txt"), na="")
write_tsv(pr2_sequences_update,str_c(dir_pr2_update, "/pr2_sequences_update.txt"), na="")
write_tsv(pr2_metadata_update,str_c(dir_pr2_update, "/pr2_metadata_update.txt"), na="")
db_append_records (pr2_db, table_name = "pr2_main", table_df = pr2_main_update )
db_append_records (pr2_db, table_name = "pr2_sequences", table_df = pr2_sequences_update )
db_append_records (pr2_db, table_name = "pr2_metadata", table_df = pr2_metadata_update )
8 Reference sequences
8.1 Find the references sequences
- 45 sequences
pr2_update_reference <- pr2_update %>% filter(!is.na(reference_sequence))
8.2 Build query to upgrade the fields
pr2_update_reference <- pr2_update_reference %>%
mutate(query = str_c("UPDATE pr2_main",
" SET reference_sequence = ", db_sql_escape(reference_sequence),
" WHERE genbank_accession = ",db_sql_escape(genbank_accession)))
#Execute query
pr2_update_reference$query_result <- db_execute_query_vector(pr2_db,pr2_update_reference$query)