library(knitr)
library(rmdformats)
library(kableExtra)
knitr::opts_chunk$set(fig.width=8,
fig.height=6,
eval=FALSE,
cache=TRUE,
echo=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=90)
options(max.print="300")
1 Goal
Version : 4.10.0
Use the initial database saved by Laure
- fix sequences that are too long (probably a problem introduced by Richard)
- label sequences that are chimeras
- label reference sequences
2 Initialize
2.1 Load libraries and functions
source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_pr2.R")
library(dada2)
2.2 Initialize and read pr2
pr2_main <- get_query(db_pr2, "select * from pr2_main")
pr2_main <- pr2_main %>% filter(is.na(removed_version))
pr2_seq <- get_query(db_pr2, "select * from pr2_sequences")
pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version))
pr2_metadata <- get_query(db_pr2, "select * from pr2_metadata")
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)
2.3 Define version and annotators
pr2_version = "4.10.0" # Note that this is now a string
version_directory <- paste0("C:/Data Biomol/RNA/_PR2/versions/", pr2_version,
"/")
pr2_editor = "Vaulot D."
3 Incorporate data from Laure original file
File is : PR2_original.xlsx
Also define a number of file names to be used latter
# Read the Excel file - Need to put guess_max at the max
file_pr2_original <- "C:/Data Biomol/RNA/_PR2/updates/2011 PR2 original/PR2_original.xlsx"
pr2_original <- read_excel(file_pr2_original, col_names = TRUE, guess_max = 2e+05)
dir_pr2_original <- dirname(file_pr2_original)
3.1 Find sequences for which the sequence length does not match the difference end - begin +1
3.1.1 Sequences that are too long
- Find the sequences that are too long
Result : 1106 sequences
pr2_seq_too_long <- pr2
pr2_seq_too_long$start_end <- pr2$end - pr2$start + 1
pr2_seq_too_long <- pr2_seq_too_long %>% filter(sequence_length > start_end) %>%
select(pr2_accession, genbank_accession, label, division, class, species,
added_version, start, end, start_end, sequence_length, sequence) %>%
arrange(division, class, species)
- Is the sequence in the original PR2 database ?
In this case the sequence is taken from the old database
pr2_seq_too_long <- pr2_seq_too_long %>% left_join(transmute(pr2_original, pr0_genbank_accession,
pr0_pr2_accession, pr0_seq1, pr0_seq1_length = str_length(pr0_seq1)), by = c(genbank_accession = "pr0_genbank_accession"))
pr2_seq_too_long$sequence_new <- pr2_seq_too_long$pr0_seq1
- How many sequences remain to be reimported
Result : 528
pr2_seq_too_long_to_get <- pr2_seq_too_long %>% filter(is.na(sequence_new))
- Find sequences that are from genome sequences (in this case the 3rd character of the accession number if a letter)
These sequences will NOT be updated and removed from PR2
Result : 4 sequences
pr2_seq_too_long_to_get_genome <- pr2_seq_too_long_to_get %>% filter(str_detect(str_sub(genbank_accession,
3, 4), "[A-Z]"))
- For all these sequences that are not in the original PR2, the sequences are re-extracted from the GenBank sequence.
Result : 524 sequences
pr2_seq_too_long_to_get <- pr2_seq_too_long_to_get %>% filter(!str_detect(str_sub(genbank_accession,
3, 4), "[A-Z]"))
pr2_seq_too_long_to_get <- genbank_download_parse(pr2_seq_too_long_to_get$genbank_accession,
genbank_directory)
pr2_seq_too_long <- pr2_seq_too_long %>% left_join(transmute(pr2_seq_too_long_to_get,
genbank_accession, sequence_gb = sequence))
- Divide into 3 sets
Set A : sequences from Laure PR2 - 578 sequences
pr2_seq_too_long_set_A <- pr2_seq_too_long %>% filter(!is.na(sequence_new))
Set B : sequences from GenBank - 524 sequences
pr2_seq_too_long_set_B <- pr2_seq_too_long %>% filter(!is.na(sequence_gb))
pr2_seq_too_long_set_B <- pr2_seq_too_long_set_B %>% mutate(sequence_new = str_sub(sequence_gb,
start, end))
Join Set A and B - 1102 sequences
pr2_seq_too_long_set_AB <- bind_rows(pr2_seq_too_long_set_A, pr2_seq_too_long_set_B) %>%
mutate(sequence_length_new = str_length(sequence_new), ambiguities_new = str_count(sequence_new,
pattern = "[NRYSWKMBDHV]"))
# Check that all sequences pass the sequence contrainsts..
pr2_seq_too_long_set_AB_exclude <- pr2_seq_too_long_set_AB %>% filter((sequence_length_new <
sequence_length_min) & (ambiguities_new > ambiguities_max))
Now create queries to update the tables
pr2_seq_too_long_set_AB <- pr2_seq_too_long_set_AB %>% mutate(query_seq = paste0("UPDATE pr2_sequences SET ",
update_field_string("sequence", sequence_new, TRUE), update_field_string("sequence_length",
sequence_length_new, TRUE), update_field_string("ambiguities", ambiguities_new,
FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession,
FALSE)), query_main = paste0("UPDATE pr2_main SET ", update_field_string("edited_version",
pr2_version, TRUE), update_field_string("edited_by", pr2_editor, TRUE),
update_field_string("edited_remark", "length of sequence fixed (before too long)",
FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession,
FALSE)))
pr2_seq_too_long_set_AB$query_seq_result <- execute_query_vector(db_pr2, pr2_seq_too_long_set_AB$query_seq)
pr2_seq_too_long_set_AB$query_main_result <- execute_query_vector(db_pr2, pr2_seq_too_long_set_AB$query_main)
Set C : No sequences - 6 sequences - These sequences are removed from the database
pr2_seq_too_long_set_C <- pr2_seq_too_long %>% filter(is.na(sequence_gb) & is.na(sequence_new))
pr2_seq_too_long_set_C <- pr2_seq_too_long_set_C %>% mutate(query_main = paste0("UPDATE pr2_main SET ",
update_field_string("removed_version", pr2_version, TRUE), update_field_string("edited_by",
pr2_editor, TRUE), update_field_string("edited_remark", "length of sequence too long",
FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession,
FALSE)))
pr2_seq_too_long_set_C$query_main_result <- execute_query_vector(db_pr2, pr2_seq_too_long_set_C$query_main)
3.1.2 Sequences that are too short
- Find the sequences that are too short.
Note it is normal that sequences with “UC” and “R” are shorter because intron has been removed.
Result : 1681 sequences
pr2_seq_too_short <- pr2
pr2_seq_too_short$start_end <- pr2$end - pr2$start + 1
pr2_seq_too_short <- pr2_seq_too_short %>% filter(sequence_length < start_end) %>%
filter(!(label %in% c("UC", "R"))) %>% select(pr2_accession, genbank_accession,
label, division, class, species, added_version, start, end, start_end, sequence_length,
sequence) %>% arrange(division, class, species)
- For all the sequences, a remark added in the pr2_main table
pr2_seq_too_short <- pr2_seq_too_short %>% mutate(query_main = paste0("UPDATE pr2_main SET ",
update_field_string("edited_version", pr2_version, TRUE), update_field_string("edited_by",
pr2_editor, TRUE), update_field_string("edited_remark", "length of sequence too short compared to end-start+1",
FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession,
FALSE)))
pr2_seq_too_short$query_main_result <- execute_query_vector(db_pr2, pr2_seq_too_short$query_main)
3.2 Sequences labelled as reference in Laure PR2 Original database
- Filter the original sequences
Result : 23 194
pr2_original_reference <- pr2_original %>% filter(pr0_reference_sequence ==
"Referenced sequence")
- Merge with pr2, emoving sequences that already labelled with reference_sequence = 1
Result : 21 537
pr2_reference <- pr2 %>% filter(genbank_accession %in% pr2_original_reference$pr0_genbank_accession) %>%
filter(is.na(reference_sequence))
- Update database
pr2_reference <- pr2_reference %>% mutate(query_main = paste0("UPDATE pr2_main SET ",
update_field_string("edited_version", pr2_version, TRUE), update_field_string("edited_by",
"Guillou L.", TRUE), update_field_string("remark", "reference sequence in original PR2 database",
TRUE), update_field_string("reference_sequence", 1, FALSE), " WHERE ",
update_field_string("pr2_accession", pr2_accession, FALSE)))
pr2_reference$query_main_result <- execute_query_vector(db_pr2, pr2_reference$query_main)
3.3 Sequences labelled as chimera in Laure PR2 Original database
- Filter the original sequences
Result : 1139
pr2_original_chimera <- pr2_original %>% filter(pr0_reference_sequence == "Chimera OK")
- Merge with pr2, removing sequences that already labelled with reference_sequence = 1
Result : 7
pr2_chimera <- pr2 %>% filter(genbank_accession %in% pr2_original_chimera$pr0_genbank_accession) %>%
filter(is.na(reference_sequence))
- Update database
pr2_chimera <- pr2_chimera %>% mutate(query_main = paste0("UPDATE pr2_main SET ",
update_field_string("removed_version", pr2_version, TRUE), update_field_string("edited_by",
"Guillou L.", TRUE), update_field_string("chimera_remark", "chimera in original PR2 database",
TRUE), update_field_string("chimera", 1, FALSE), " WHERE ", update_field_string("pr2_accession",
pr2_accession, FALSE)))
pr2_chimera$query_main_result <- execute_query_vector(db_pr2, pr2_chimera$query_main)
3.4 Save the files of this version
source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_pr2.R")
pr2_export_all()