PR2 version 4.11.0
Clean-up
By default do not run any of the chunks when knitting
knitr::opts_chunk$set(eval=FALSE)
Load the variables common to the different scripts and the necessary libraries
source('PR2_init.R', echo=FALSE)
1 Main changes
Version 4.11.0
- Database is now stored locally (“pr2_local”)
- Remove sequences shorter than 500 bp - 173 sequences affected
- Correct PR2 accession numbers, start, end and label - 1170 sequences affected
- GenBank entries with 2 PR2 sequences and different taxonomy - 4 sequences corrected and 2 removed (AF245249, AY706334, FJ848510, JF276416, KM020045, KM020071)
- ES339670.68.1270_U remove because it was a mRNA sequence (still wonderning what it was doing there…)
- Sequences labelled as bad quality by EukRef
- Remove the one that are bad quality
- Annotate the chimeras
2 Read pr2
pr2_db_con <- db_connect(db_info("pr2_local"))
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))
pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version)) %>% collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>% collect()
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)
db_disconnect(pr2_db_con)
pr2 <- pr2 %>% filter(is.na(removed_version))
3 Remove sequences that have less than 500 bp
- Issue # 7
- Result : 173 sequences found and removed
pr2_short <- pr2 %>% filter(sequence_length < pr2.env$sequence_length_min)
pr2_short <- pr2_short %>% mutate(query_main = str_c("UPDATE pr2_main SET ",
dvutils::db_update_field_string("removed_version", pr2.env$version, TRUE),
dvutils::db_update_field_string("edited_by", pr2.env$editor, TRUE), dvutils::db_update_field_string("edited_remark",
"length of sequence too short (<500 bp)", FALSE), " WHERE ", dvutils::db_update_field_string("pr2_accession",
pr2_accession, FALSE)))
pr2_short$query_main_result <- dvutils::db_execute_query_vector(db_info("pr2"),
pr2_short$query_main)
4 Fix PR2 accessions that do not follow the naming rule
- Issue # 4
- A few PR2 id do not follow the standard rule for constructing PR2 id (Genbank.start.end_X). This needs to be corrected (see sequences id 176963 -> 178133 in particular).
- Results : 1170 sequences to correct
pr2_bad_accession <- pr2 %>% filter(!str_detect(pr2_accession, "\\.")
pr2_bad_accession <- pr2_bad_accession %>%
mutate(pr2_accession_new = str_c(genbank_accession,".1.",sequence_length,"_U")) %>%
mutate(query_main = str_c("UPDATE pr2_main SET ",
dvutils::db_update_field_string("pr2_accession", pr2_accession_new, FALSE),
" WHERE ",
dvutils::db_update_field_string("pr2_accession", pr2_accession, FALSE)),
query_sequences = str_c("UPDATE pr2_sequences SET ",
dvutils::db_update_field_string("pr2_accession", pr2_accession_new, FALSE),
" WHERE ",
dvutils::db_update_field_string("pr2_accession", pr2_accession, FALSE))
)
pr2_bad_accession$query_main_result <- dvutils::db_execute_query_vector (db_info("pr2"), pr2_bad_accession$query_main)
pr2_bad_accession$query_sequences_result <- dvutils::db_execute_query_vector (db_info("pr2"), pr2_bad_accession$query_sequences)
5 Remove sequence labelled as low quality or chimera by Eukref
Data from https://github.com/eukref/curation/edit/master/EUKREF_low_quality_sequences.txt
5.1 Read the Excel
- Remove the Genbank version #
- 748 sequences
eukref_low_qual <- read_excel("../EukRef/Eukref low quality.xlsx") %>% mutate(genbank_accession = str_replace(genbank_accession,
"[.][0-9]+", ""))
# Summarize the bad quality sequences
eukref_low_qual %>% group_by(remark_quality) %>% summarise(n_seq = n())
# A tibble: 4 x 2
remark_quality n_seq
<chr> <int>
1 chimera(manual) 86
2 chimera(uchime) 403
3 dubious(manual) 6
4 low_quality(manual) 253
5.2 Find the sequences that are in PR2
- 408 low quality sequences in PR2
pr2_low_qual <- left_join(eukref_low_qual, pr2) %>% filter(!is.na(pr2_accession))
pr2_remove <- pr2_low_qual %>% filter(str_detect(remark_quality, "low_quality|dubious"))
glue::glue("Number of sequences of bad quality : {nrow(pr2_remove)}")
Number of sequences of bad quality : 125
pr2_chimeras <- pr2_low_qual %>% filter(str_detect(remark_quality, "chimera"))
glue::glue("Number of chimeras : {nrow(pr2_chimeras)}")
Number of chimeras : 283
5.3 Update database
First remove the bad quality sequences
pr2_remove <- pr2_remove %>% mutate(query = str_c("UPDATE pr2_main", " SET removed_version = ",
db_sql_escape(pr2.env$version), ", edited_by = ", db_sql_escape(pr2.env$editor),
", edited_remark= ", db_sql_escape(str_c("Eukref - ", remark_quality)),
" WHERE pr2_accession = ", db_sql_escape(pr2_accession)))
# Execute query
pr2_remove$query_result <- db_execute_query_vector(pr2_db, pr2_remove$query)
Then annotate the chimeras
pr2_chimeras <- pr2_chimeras %>% mutate(query = str_c("UPDATE pr2_main", " SET removed_version = ",
db_sql_escape(pr2.env$version), ", edited_by = ", db_sql_escape(pr2.env$editor),
", chimera = ", db_sql_escape(1), ", chimera_remark= ", db_sql_escape(str_c("Eukref - ",
remark_quality)), " WHERE pr2_accession = ", db_sql_escape(pr2_accession)))
# Execute query
pr2_chimeras$query_result <- db_execute_query_vector(pr2_db, pr2_chimeras$query)