PR2
Adding GenBank sequences

1 Instructions to add new Genbank sequences

  • Do a query using genbank_search query
  • Filter for sequences not present in PR2
  • Save as an rds file
  • Move rds to Roscoff server and run script called script_genbank.R using sbatch_genbank.sh
    • Get the metadata and sequences
    • Get the features
  • MOve metadata and features result file to local directory
  • Run rest of script to filter the data and save as an Excel file
  • If find features files then only get the rRNA part

2 Init

  source('PR2_init.R', echo=FALSE)
  source('PR2_read_google.R', echo=FALSE)

3 Set up the files

  dir_pr2_update <- "4.14/GenBank"
  
  pr2.env$editor <- "D. Vaulot"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

# create the directory for taxonomy output
  dir.create(full_path(""), showWarnings = FALSE)

4 Part I - Get sequence list from GenBank and save as rds file

Skip this part if you have already the accession number list

4.2 Filter new sequences

pr2_new <- pr2_update_all %>% 
  filter(!(genbank_accession %in% pr2$genbank_accession)) 
  
pr2_genbank_new <- pr2_new %>% 
  select(genbank_accession)

4.3 Save the list of new sequences

  • Move this file to Roscoff server
saveRDS(pr2_genbank_new, full_path("pr2_genbank_new.rds"))

5 Part II - Run the script PR2 genbank download.R on Roscoff server

6 Part III - Read file with all Genbank info and save to Excel file for import

6.1 Read the metadata files and features files and join by genbank accession

pr2_new <- read_tsv(full_path("pr2_gb_metadata_2021-03-23.txt"), guess_max = 20000) 

pr2_features <- read_tsv(full_path("pr2_gb_features_2021-03-24.txt"), guess_max = 20000) %>% 
  select(-seqnames) %>% 
  filter(product %in% c("small subunit ribosomal RNA", "18S ribosomal RNA", "18S small subunit ribosomal RNA") |
          note  %in% c("small subunit ribosomal RNA", "18S ribosomal RNA", "18S small subunit ribosomal RNA")) %>% 
  distinct()

pr2_new <-  pr2_new %>% 
  left_join(pr2_features) %>% 
  mutate(gb_isolate=as.character(gb_isolate)) %>%  
  distinct() 

6.2 Add fields

 pr2_new <- pr2_new %>% 
  mutate(start=case_when(is.na(start) ~ 1, 
                         TRUE ~ start),
         end=case_when(is.na(end) ~sequence_length,
                       TRUE ~ end),
         label = "U",
         pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene = case_when( str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA", 
                           str_detect(gb_definition, "chloroplast|mitochondr") ~ "16S_rRNA", 
                           TRUE ~ "18S_rRNA"),
         organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph", 
                               str_detect(gb_definition, "chloroplast") ~ "plastid",
                               str_detect(gb_definition, "mitochondr") ~ "mitochondrion",
                               TRUE ~ "nucleus"),
         sequence = str_sub(gb_sequence, start, end), 
         sequence_length = str_length(sequence),
         ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex),
         # Next time remove from here because it is now in the script to run on the server
         pr2_sample_type = case_when(str_detect(gb_definition, "uncultured|Uncultured") ~ "environmental",
                                     str_detect(gb_isolation_source, "(?i)water|sediment|ice|plankton|blood|fecal|soil|lake|river|gulf") ~ "environmental",
                                     str_detect(gb_organism, "metagenom") ~ "environmental",
                                     !is.na(gb_specimen_voucher)~ "isolate",
                                     TRUE ~ pr2_sample_type)
         )

 pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1)

6.3 Remove sequences with ambiguities or too short

pr2_remove <- pr2_new %>% 
  filter((sequence_length < pr2.env$sequence_length_min)|
         (sequence_length > pr2.env$sequence_length_max)|  
         (ambiguities > pr2.env$ambiguities_max)| 
          str_detect(sequence, pr2.env$sequence_N_repeat)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(pr2_accession, sequence_length,ambiguities, gb_definition) 

  DT::datatable(pr2_remove, caption = "Sequence to remove - too short, too many ambiguities etc...")
pr2_new <- pr2_new %>% 
  filter(!(pr2_accession %in% pr2_remove$pr2_accession) )

6.3.1 Check duplicate entries

 pr2_new %>% 
  count(pr2_accession) %>% 
  filter(n > 1)
# A tibble: 0 x 2
# ... with 2 variables: pr2_accession <chr>, n <int>

6.4 Get some statistics

 ggplot(pr2_new, aes(x=sequence_length)) +
  geom_histogram() +
  xlim(0,3000)

6.4.1 Final files for uploading

 sprintf(" Final number of sequences added:  %d", nrow(pr2_new))
[1] " Final number of sequences added:  8765"
 colnames(pr2_new)
 [1] "genbank_accession"       "gb_id"                   "gb_project_id"           "gb_definition"           "gb_organism"             "gb_strain"               "gb_organelle"            "gb_isolate"             
 [9] "gb_clone"                "gb_specimen_voucher"     "gb_collected_by"         "gb_lat_lon"              "gb_note"                 "gb_culture_collection"   "gb_isolation_source"     "gb_host"                
[17] "gb_environmental_sample" "gb_collection_date"      "gb_country"              "gb_date"                 "pr2_longitude"           "pr2_latitude"            "pr2_sample_type"         "gb_sequence"            
[25] "sequence_length"         "start"                   "end"                     "width"                   "strand"                  "type"                    "note"                    "loctype"                
[33] "product"                 "estimated_length"        "label"                   "pr2_accession"           "reference_sequence"      "gene"                    "organelle"               "sequence"               
[41] "ambiguities"             "sequence_hash"          
 pr2_update_new <- pr2_new
 
  pr2_main_new <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, organelle, gene) %>% 
    mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
  nrow(pr2_main_new)
[1] 8765
  pr2_sequences_new <- pr2_update_new %>% 
    select(pr2_accession, sequence, sequence_length, ambiguities, sequence_hash)
  nrow(pr2_sequences_new)
[1] 8765
  pr2_metadata_new <- pr2_update_new %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>% 
    select( -pr2_accession) %>% 
    distinct()
  nrow(pr2_metadata_new)
[1] 8759

7 Save everything to an Excel file

 file_pr2_import <-  full_path("pr2_genbank_import_final.xlsx")
 onglets <- list("pr2_main_new" = pr2_main_new, 
                 "pr2_sequence_new" = pr2_sequences_new,
                 "pr2_metadata_new" = pr2_metadata_new)
 export(onglets, file_pr2_import, zoom = 90, firstRow = TRUE, firstCol = TRUE)