PR2
Adding GenBank sequences

1 Instructions to add new Genbank sequences

Do a query using genbank_search query
Filter for sequences not present in PR2
Save as an rds file
Move rds to Roscoff server and run script called script_genbank.R using sbatch_genbank.sh
- Get the metadata and sequences
- Get the features
MOve metadata and features result file to local directory
Run rest of script to filter the data and save as an Excel file
If find features files then only get the rRNA part

2 Init

  source('PR2_init.R', echo=FALSE)
  source('PR2_read_google.R', echo=FALSE)

3 Set up the files

  dir_pr2_update <- "4.14/GenBank"
  
  pr2.env$editor <- "D. Vaulot"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

# create the directory for taxonomy output
  dir.create(full_path(""), showWarnings = FALSE)

4 Part I - Get sequence list from GenBank and save as rds file

Skip this part if you have already the accession number list

4.1 Genbank search

pr2_update <- list()

query_exclude = "NOT homo[TITL] NOT Streptophyta[ORGN] NOT Fungi[ORGN] NOT Metazoa[ORGN] NOT Bacteria[ORGN] NOT mitochondrion[TITL]"

query_include = "(18S ribosomal RNA gene[TITL] OR small subunit ribosomal RNA gene[TITL])"

query_date = "AND 2020/05/01:2021/03/23[Publication Date]"

pr2_update[[1]] <- genbank_search(query = str_c(query_include, query_exclude, query_date, sep=" "), seq_max = 15000)
pr2_update[[2]] <- genbank_search(query = "small subunit ribosomal RNA gene[TITL] AND Cafeteria[ORGN]", seq_max = 10000)
# pr2_update[[3]] <- genbank_search(query = "MN315515[ACCN]", seq_max = 10000)

pr2_update_all <- reduce(pr2_update, bind_rows)

# export(pr2_update_all, full_path("pr2_genbank_new_2021.xlsx"), 
#            zoom = 90, firstRow = TRUE, firstCol = TRUE)

4.2 Filter new sequences

pr2_new <- pr2_update_all %>% 
  filter(!(genbank_accession %in% pr2$genbank_accession)) 
  
pr2_genbank_new <- pr2_new %>% 
  select(genbank_accession)

4.3 Save the list of new sequences

Move this file to Roscoff server

saveRDS(pr2_genbank_new, full_path("pr2_genbank_new.rds"))

5 Part II - Run the script PR2 genbank download.R on Roscoff server

6 Part III - Read file with all Genbank info and save to Excel file for import

6.1 Read the metadata files and features files and join by genbank accession

pr2_new <- read_tsv(full_path("pr2_gb_metadata_2021-03-23.txt"), guess_max = 20000) 

pr2_features <- read_tsv(full_path("pr2_gb_features_2021-03-24.txt"), guess_max = 20000) %>% 
  select(-seqnames) %>% 
  filter(product %in% c("small subunit ribosomal RNA", "18S ribosomal RNA", "18S small subunit ribosomal RNA") |
          note  %in% c("small subunit ribosomal RNA", "18S ribosomal RNA", "18S small subunit ribosomal RNA")) %>% 
  distinct()

pr2_new <-  pr2_new %>% 
  left_join(pr2_features) %>% 
  mutate(gb_isolate=as.character(gb_isolate)) %>%  
  distinct()

6.2 Add fields

 pr2_new <- pr2_new %>% 
  mutate(start=case_when(is.na(start) ~ 1, 
                         TRUE ~ start),
         end=case_when(is.na(end) ~sequence_length,
                       TRUE ~ end),
         label = "U",
         pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene = case_when( str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA", 
                           str_detect(gb_definition, "chloroplast|mitochondr") ~ "16S_rRNA", 
                           TRUE ~ "18S_rRNA"),
         organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph", 
                               str_detect(gb_definition, "chloroplast") ~ "plastid",
                               str_detect(gb_definition, "mitochondr") ~ "mitochondrion",
                               TRUE ~ "nucleus"),
         sequence = str_sub(gb_sequence, start, end), 
         sequence_length = str_length(sequence),
         ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex),
         # Next time remove from here because it is now in the script to run on the server
         pr2_sample_type = case_when(str_detect(gb_definition, "uncultured|Uncultured") ~ "environmental",
                                     str_detect(gb_isolation_source, "(?i)water|sediment|ice|plankton|blood|fecal|soil|lake|river|gulf") ~ "environmental",
                                     str_detect(gb_organism, "metagenom") ~ "environmental",
                                     !is.na(gb_specimen_voucher)~ "isolate",
                                     TRUE ~ pr2_sample_type)
         )

 pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1)

6.3 Remove sequences with ambiguities or too short

pr2_remove <- pr2_new %>% 
  filter((sequence_length < pr2.env$sequence_length_min)|
         (sequence_length > pr2.env$sequence_length_max)|  
         (ambiguities > pr2.env$ambiguities_max)| 
          str_detect(sequence, pr2.env$sequence_N_repeat)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(pr2_accession, sequence_length,ambiguities, gb_definition) 

  DT::datatable(pr2_remove, caption = "Sequence to remove - too short, too many ambiguities etc...")

pr2_new <- pr2_new %>% 
  filter(!(pr2_accession %in% pr2_remove$pr2_accession) )

6.3.1 Check duplicate entries

 pr2_new %>% 
  count(pr2_accession) %>% 
  filter(n > 1)

# A tibble: 0 x 2
# ... with 2 variables: pr2_accession <chr>, n <int>

6.4 Get some statistics

 ggplot(pr2_new, aes(x=sequence_length)) +
  geom_histogram() +
  xlim(0,3000)

6.4.1 Final files for uploading

 sprintf(" Final number of sequences added:  %d", nrow(pr2_new))

[1] " Final number of sequences added:  8765"

 colnames(pr2_new)

 [1] "genbank_accession"       "gb_id"                   "gb_project_id"           "gb_definition"           "gb_organism"             "gb_strain"               "gb_organelle"            "gb_isolate"             
 [9] "gb_clone"                "gb_specimen_voucher"     "gb_collected_by"         "gb_lat_lon"              "gb_note"                 "gb_culture_collection"   "gb_isolation_source"     "gb_host"                
[17] "gb_environmental_sample" "gb_collection_date"      "gb_country"              "gb_date"                 "pr2_longitude"           "pr2_latitude"            "pr2_sample_type"         "gb_sequence"            
[25] "sequence_length"         "start"                   "end"                     "width"                   "strand"                  "type"                    "note"                    "loctype"                
[33] "product"                 "estimated_length"        "label"                   "pr2_accession"           "reference_sequence"      "gene"                    "organelle"               "sequence"               
[41] "ambiguities"             "sequence_hash"

 pr2_update_new <- pr2_new
 
  pr2_main_new <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, organelle, gene) %>% 
    mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
  nrow(pr2_main_new)

[1] 8765

  pr2_sequences_new <- pr2_update_new %>% 
    select(pr2_accession, sequence, sequence_length, ambiguities, sequence_hash)
  nrow(pr2_sequences_new)

[1] 8765

  pr2_metadata_new <- pr2_update_new %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>% 
    select( -pr2_accession) %>% 
    distinct()
  nrow(pr2_metadata_new)

[1] 8759

7 Save everything to an Excel file

 file_pr2_import <-  full_path("pr2_genbank_import_final.xlsx")
 onglets <- list("pr2_main_new" = pr2_main_new, 
                 "pr2_sequence_new" = pr2_sequences_new,
                 "pr2_metadata_new" = pr2_metadata_new)
 export(onglets, file_pr2_import, zoom = 90, firstRow = TRUE, firstCol = TRUE)

PR2 Adding GenBank sequences

PR2 Adding GenBank sequences

1 Instructions to add new Genbank sequences

2 Init

3 Set up the files

4 Part I - Get sequence list from GenBank and save as rds file

4.1 Genbank search

4.2 Filter new sequences

4.3 Save the list of new sequences

5 Part II - Run the script PR2 genbank download.R on Roscoff server

6 Part III - Read file with all Genbank info and save to Excel file for import

6.1 Read the metadata files and features files and join by genbank accession

6.2 Add fields

6.3 Remove sequences with ambiguities or too short

6.3.1 Check duplicate entries

6.4 Get some statistics

6.4.1 Final files for uploading

7 Save everything to an Excel file

PR2
Adding GenBank sequences

PR2
Adding GenBank sequences