library(knitr)
  library(rmdformats)
  library(kableExtra) 

  knitr::opts_chunk$set(fig.width=8, 
                        fig.height=6, 
                        eval=FALSE, 
                        cache=TRUE,
                        echo=TRUE,
                        prompt=FALSE,
                        tidy=TRUE,
                        comment=NA,
                        message=FALSE,
                        warning=FALSE)
  opts_knit$set(width=90)
  options(max.print="300")  

1 Goal

Version : 4.10.0

Use the initial database saved by Laure

  • fix sequences that are too long (probably a problem introduced by Richard)
  • label sequences that are chimeras
  • label reference sequences

2 Initialize

2.1 Load libraries and functions

source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_pr2.R")
library(dada2)

2.2 Initialize and read pr2

pr2_main <- get_query(db_pr2, "select * from pr2_main")
pr2_main <- pr2_main %>% filter(is.na(removed_version))

pr2_seq <- get_query(db_pr2, "select * from pr2_sequences")

pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version))
pr2_metadata <- get_query(db_pr2, "select * from pr2_metadata")

# Join the tables and keep only sequences that are not removed

pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)

2.3 Define version and annotators

pr2_version = "4.10.0"  # Note that this is now a string
version_directory <- paste0("C:/Data Biomol/RNA/_PR2/versions/", pr2_version, 
    "/")
pr2_editor = "Vaulot D."

3 Incorporate data from Laure original file

File is : PR2_original.xlsx

Also define a number of file names to be used latter

# Read the Excel file - Need to put guess_max at the max
file_pr2_original <- "C:/Data Biomol/RNA/_PR2/updates/2011 PR2 original/PR2_original.xlsx"
pr2_original <- read_excel(file_pr2_original, col_names = TRUE, guess_max = 2e+05)

dir_pr2_original <- dirname(file_pr2_original)

3.1 Find sequences for which the sequence length does not match the difference end - begin +1

3.1.1 Sequences that are too long

  1. Find the sequences that are too long
    Result : 1106 sequences
pr2_seq_too_long <- pr2
pr2_seq_too_long$start_end <- pr2$end - pr2$start + 1
pr2_seq_too_long <- pr2_seq_too_long %>% filter(sequence_length > start_end) %>% 
    select(pr2_accession, genbank_accession, label, division, class, species, 
        added_version, start, end, start_end, sequence_length, sequence) %>% 
    arrange(division, class, species)
  1. Is the sequence in the original PR2 database ?

In this case the sequence is taken from the old database

pr2_seq_too_long <- pr2_seq_too_long %>% left_join(transmute(pr2_original, pr0_genbank_accession, 
    pr0_pr2_accession, pr0_seq1, pr0_seq1_length = str_length(pr0_seq1)), by = c(genbank_accession = "pr0_genbank_accession"))
pr2_seq_too_long$sequence_new <- pr2_seq_too_long$pr0_seq1
  1. How many sequences remain to be reimported
    Result : 528
pr2_seq_too_long_to_get <- pr2_seq_too_long %>% filter(is.na(sequence_new))
  1. Find sequences that are from genome sequences (in this case the 3rd character of the accession number if a letter)
    These sequences will NOT be updated and removed from PR2
    Result : 4 sequences
pr2_seq_too_long_to_get_genome <- pr2_seq_too_long_to_get %>% filter(str_detect(str_sub(genbank_accession, 
    3, 4), "[A-Z]"))
  1. For all these sequences that are not in the original PR2, the sequences are re-extracted from the GenBank sequence.
    Result : 524 sequences
pr2_seq_too_long_to_get <- pr2_seq_too_long_to_get %>% filter(!str_detect(str_sub(genbank_accession, 
    3, 4), "[A-Z]"))

pr2_seq_too_long_to_get <- genbank_download_parse(pr2_seq_too_long_to_get$genbank_accession, 
    genbank_directory)

pr2_seq_too_long <- pr2_seq_too_long %>% left_join(transmute(pr2_seq_too_long_to_get, 
    genbank_accession, sequence_gb = sequence))
  1. Divide into 3 sets

Set A : sequences from Laure PR2 - 578 sequences

pr2_seq_too_long_set_A <- pr2_seq_too_long %>% filter(!is.na(sequence_new))

Set B : sequences from GenBank - 524 sequences

pr2_seq_too_long_set_B <- pr2_seq_too_long %>% filter(!is.na(sequence_gb))
pr2_seq_too_long_set_B <- pr2_seq_too_long_set_B %>% mutate(sequence_new = str_sub(sequence_gb, 
    start, end))

Join Set A and B - 1102 sequences

pr2_seq_too_long_set_AB <- bind_rows(pr2_seq_too_long_set_A, pr2_seq_too_long_set_B) %>% 
    mutate(sequence_length_new = str_length(sequence_new), ambiguities_new = str_count(sequence_new, 
        pattern = "[NRYSWKMBDHV]"))
# Check that all sequences pass the sequence contrainsts..
pr2_seq_too_long_set_AB_exclude <- pr2_seq_too_long_set_AB %>% filter((sequence_length_new < 
    sequence_length_min) & (ambiguities_new > ambiguities_max))

Now create queries to update the tables

pr2_seq_too_long_set_AB <- pr2_seq_too_long_set_AB %>% mutate(query_seq = paste0("UPDATE pr2_sequences SET ", 
    update_field_string("sequence", sequence_new, TRUE), update_field_string("sequence_length", 
        sequence_length_new, TRUE), update_field_string("ambiguities", ambiguities_new, 
        FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession, 
        FALSE)), query_main = paste0("UPDATE pr2_main SET ", update_field_string("edited_version", 
    pr2_version, TRUE), update_field_string("edited_by", pr2_editor, TRUE), 
    update_field_string("edited_remark", "length of sequence fixed (before too long)", 
        FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession, 
        FALSE)))

pr2_seq_too_long_set_AB$query_seq_result <- execute_query_vector(db_pr2, pr2_seq_too_long_set_AB$query_seq)
pr2_seq_too_long_set_AB$query_main_result <- execute_query_vector(db_pr2, pr2_seq_too_long_set_AB$query_main)

Set C : No sequences - 6 sequences - These sequences are removed from the database

pr2_seq_too_long_set_C <- pr2_seq_too_long %>% filter(is.na(sequence_gb) & is.na(sequence_new))
pr2_seq_too_long_set_C <- pr2_seq_too_long_set_C %>% mutate(query_main = paste0("UPDATE pr2_main SET ", 
    update_field_string("removed_version", pr2_version, TRUE), update_field_string("edited_by", 
        pr2_editor, TRUE), update_field_string("edited_remark", "length of sequence too long", 
        FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession, 
        FALSE)))
pr2_seq_too_long_set_C$query_main_result <- execute_query_vector(db_pr2, pr2_seq_too_long_set_C$query_main)

3.1.2 Sequences that are too short

  1. Find the sequences that are too short.
    Note it is normal that sequences with “UC” and “R” are shorter because intron has been removed.
    Result : 1681 sequences
pr2_seq_too_short <- pr2
pr2_seq_too_short$start_end <- pr2$end - pr2$start + 1
pr2_seq_too_short <- pr2_seq_too_short %>% filter(sequence_length < start_end) %>% 
    filter(!(label %in% c("UC", "R"))) %>% select(pr2_accession, genbank_accession, 
    label, division, class, species, added_version, start, end, start_end, sequence_length, 
    sequence) %>% arrange(division, class, species)
  1. For all the sequences, a remark added in the pr2_main table
pr2_seq_too_short <- pr2_seq_too_short %>% mutate(query_main = paste0("UPDATE pr2_main SET ", 
    update_field_string("edited_version", pr2_version, TRUE), update_field_string("edited_by", 
        pr2_editor, TRUE), update_field_string("edited_remark", "length of sequence too short compared to end-start+1", 
        FALSE), " WHERE ", update_field_string("pr2_accession", pr2_accession, 
        FALSE)))
pr2_seq_too_short$query_main_result <- execute_query_vector(db_pr2, pr2_seq_too_short$query_main)

3.2 Sequences labelled as reference in Laure PR2 Original database

  1. Filter the original sequences
    Result : 23 194
pr2_original_reference <- pr2_original %>% filter(pr0_reference_sequence == 
    "Referenced sequence")
  1. Merge with pr2, emoving sequences that already labelled with reference_sequence = 1
    Result : 21 537
pr2_reference <- pr2 %>% filter(genbank_accession %in% pr2_original_reference$pr0_genbank_accession) %>% 
    filter(is.na(reference_sequence))
  1. Update database
pr2_reference <- pr2_reference %>% mutate(query_main = paste0("UPDATE pr2_main SET ", 
    update_field_string("edited_version", pr2_version, TRUE), update_field_string("edited_by", 
        "Guillou L.", TRUE), update_field_string("remark", "reference sequence in original PR2 database", 
        TRUE), update_field_string("reference_sequence", 1, FALSE), " WHERE ", 
    update_field_string("pr2_accession", pr2_accession, FALSE)))
pr2_reference$query_main_result <- execute_query_vector(db_pr2, pr2_reference$query_main)

3.3 Sequences labelled as chimera in Laure PR2 Original database

  1. Filter the original sequences
    Result : 1139
pr2_original_chimera <- pr2_original %>% filter(pr0_reference_sequence == "Chimera OK")
  1. Merge with pr2, removing sequences that already labelled with reference_sequence = 1
    Result : 7
pr2_chimera <- pr2 %>% filter(genbank_accession %in% pr2_original_chimera$pr0_genbank_accession) %>% 
    filter(is.na(reference_sequence))
  1. Update database
pr2_chimera <- pr2_chimera %>% mutate(query_main = paste0("UPDATE pr2_main SET ", 
    update_field_string("removed_version", pr2_version, TRUE), update_field_string("edited_by", 
        "Guillou L.", TRUE), update_field_string("chimera_remark", "chimera in original PR2 database", 
        TRUE), update_field_string("chimera", 1, FALSE), " WHERE ", update_field_string("pr2_accession", 
        pr2_accession, FALSE)))
pr2_chimera$query_main_result <- execute_query_vector(db_pr2, pr2_chimera$query_main)

3.4 Save the files of this version

source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_pr2.R")
pr2_export_all()