PR2 version 4.13.0
Adding GenBank sequences

1 Init
2 Read pr2
3 Set up the files
4 Get sequence list from GenBank
- 4.1 Compare with sequences in PR2
5 New sequences
6 Finalization
7 Save everything to an Excel file
8 Get metadata for which no metadata are available

1 Init

Load the variables common to the different scripts and the necessary libraries

  knitr::opts_chunk$set(eval=TRUE, 
                        cache = TRUE, 
                        # cache.extra = file.info("../updates/2019 Arsenieff diatoms/pr2_diatoms_2019_11_21_LA.xlsx"),
                        tidy = FALSE)

  source('PR2_init.R', echo=FALSE)

2 Read pr2

Use the PR2 from Google database

# Info for the local dabase 
  pr2_db <- db_info("pr2_google")

  pr2_db_con <- db_connect(pr2_db)
  pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
  # pr2_main  <- pr2_main %>% filter (is.na(removed_version))

  # pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()

  pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
  pr2_taxo  <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()

  pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()

# Join the tables and keep only sequences that are not removed

  pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
  # pr2 <- left_join (pr2, pr2_seq)
  # pr2 <- left_join (pr2, pr2_metadata)
  db_disconnect(pr2_db_con)

## [1] TRUE

  pr2_18S  <- pr2 %>% filter (is.na(removed_version)) %>% 
    filter(gene == "18S_rRNA")
  pr2_18S_removed <- pr2 %>% filter (!is.na(removed_version)) %>% 
    filter(gene == "18S_rRNA")
  
# Remove pr2 to make sure that we do not use the full pr2
  # remove(pr2)

3 Set up the files

  dir_pr2_update <- "../updates/2020 4.13.0 GenBank"
  
  pr2.env$editor <- "D. Vaulot"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

  # file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva samples.fasta")
  file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva.fasta.gz")
  
# create the directory for taxonomy output
  dir.create(full_path("taxo"), showWarnings = FALSE)

4 Get sequence list from GenBank

  pr2_update <- genbank_search(query = "18S ribosomal RNA gene[TITL] NOT internal transcribed spacer[TITL] NOT homo[TITL] AND 2018/11/16:2018/12/31[Publication Date]", seq_max = 10000)

write.xlsx(pr2_update, full_path("pr2_genbank_new_2018.xlsx"), 
           zoom = 90, firstRow = TRUE, firstCol = TRUE)

pr2_update <- genbank_search(query = "18S ribosomal RNA gene[TITL] NOT internal transcribed spacer[TITL] NOT homo[TITL] AND 2019[Publication Date]", seq_max = 10000)

write.xlsx(pr2_update, full_path("pr2_genbank_new_2019.xlsx"), 
           zoom = 90, firstRow = TRUE, firstCol = TRUE)
  
pr2_update <- genbank_search(query = "18S ribosomal RNA gene[TITL] NOT internal transcribed spacer[TITL] NOT homo[TITL] AND 2020[Publication Date]", seq_max = 10000)

write.xlsx(pr2_update, full_path("pr2_genbank_new_2020.xlsx"), 
           zoom = 90, firstRow = TRUE, firstCol = TRUE)

pr2_update_2018 <- read_xlsx(full_path("pr2_genbank_new_2018.xlsx"))
pr2_update_2019 <- read_xlsx(full_path("pr2_genbank_new_2019.xlsx"))
pr2_update_2020 <- read_xlsx(full_path("pr2_genbank_new_2020.xlsx"))

pr2_update <- bind_rows(pr2_update_2018, pr2_update_2019, pr2_update_2020)

4.1 Compare with sequences in PR2

  str_c("Number of updated sequences that are not present in PR2 : ", 
        nrow(filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))))

## [1] "Number of updated sequences that are not present in PR2 : 12504"

  str_c("Number of updated sequences that are not active in PR2 : ", 
        nrow(filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession))))

## [1] "Number of updated sequences that are not active in PR2 : 6"

   str_c("Sequences duplicated  - df ")

## [1] "Sequences duplicated  - df "

   pr2_update %>% 
     count(genbank_accession) %>% 
     filter(n > 1)

## # A tibble: 0 x 2
## # ... with 2 variables: genbank_accession <chr>, n <int>

5 New sequences

5.1 Filter new sequences

pr2_new <- pr2_update %>% 
  filter(!(genbank_accession %in% pr2$genbank_accession)) 
  
pr2_genbank_new <- pr2_new %>% 
  select(genbank_accession)

5.2 Save the list of new sequences

saveRDS(pr2_genbank_new, full_path("pr2_genbank_new_3.rds"))

5.3 Run the script PR2 genbank download.R in background

5.4 Read the metadata files and join by genbank accession

metafile <- list()

for (i in c(1:10)) {
  metafile[[i]] <- read_tsv(full_path(str_c("metadata/pr2_gb_metadata_3_",i,".txt")), 
                            col_type = cols(gb_collection_date = col_character())) %>% 
                    mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
pr2_new <- reduce(metafile, bind_rows) %>%  
  distinct() %>% 
  filter(!str_detect(gb_definition, "transcribed spacer")) %>% 
  filter(sequence_length >= pr2.env$sequence_length_min & sequence_length <= 3000)

write.xlsx(select(pr2_new, -gb_sequence) , full_path("pr2_new.xlsx"), 
           zoom = 90, firstRow = TRUE, firstCol = TRUE)

6 Finalization

6.1 Read back the taxo corrected

# pr2_taxo_updated <- read_excel(full_path("taxo_to_add_corrected.xlsx"), guess_max=200000, na=c("", "-"))
# pr2_taxo_remove <- read_excel(file_pr2_update_excel, sheet = "taxo_to_remove", guess_max=200000, na=c("", "-"))

6.2 New sequences

6.2.1 Read new sequences once the corrections have been done

Sequences with ITS region removed
Sequences with introns removed

# pr2_new <- read_excel(full_path("pr2_new_corrected.xlsx"), na="") %>% 
#   filter(!is.na(gb_definition))

6.2.2 Add fields

 pr2_new <- pr2_new %>% 
  mutate(start=1, end=sequence_length,
         label = "U",
         pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene=case_when(str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA", 
                               str_detect(gb_definition, "chloroplast") ~ "16S_rRNA", 
                               TRUE ~ "18S_rRNA"),
         organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph", 
                               str_detect(gb_definition, "chloroplast") ~ "plastid", 
                               TRUE ~ "nucleus"),
         ambiguities = str_count(gb_sequence, pattern=pr2.env$ambig_regex),
         pr2_sample_type = case_when(str_detect(gb_definition, "uncultured|Uncultured") ~ "environmental",
                                     TRUE ~ pr2_sample_type)
         )

 pr2_new$sequence_hash = purrr::map_chr(pr2_new$gb_sequence,digest::sha1)

6.2.3 Remove sequences with ambiguities or too short

filter(pr2_new, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(pr2_accession, sequence_length,ambiguities, gb_definition)

## # A tibble: 27 x 4
##    pr2_accession   sequence_length ambiguities gb_definition                    
##    <chr>                     <dbl>       <int> <chr>                            
##  1 MG972345.1.287~            2875         103 Chaetoceros sp. Na12A3 isolate N~
##  2 MG972343.1.287~            2875         103 Chaetoceros sp. Na12A3 18S ribos~
##  3 MG972335.1.268~            2689         131 Chaetoceros diversus isolate Na3~
##  4 MG972337.1.262~            2621         130 Chaetoceros diversus isolate Na1~
##  5 MG972364.1.245~            2455          87 Chaetoceros sp. Na28A1 18S ribos~
##  6 KL780694.1.243~            2432         100 Manihot esculenta cultivar KU50 ~
##  7 MF497377.1.235~            2356         830 Phryganella paradoxa isolate KD1~
##  8 MG972365.1.220~            2204         120 Chaetoceros cf. vixvisibilis iso~
##  9 MN367945.1.211~            2115         140 Bembidion platyderoides voucher ~
## 10 MG972366.1.209~            2097          74 Chaetoceros cf. vixvisibilis iso~
## # ... with 17 more rows

pr2_new<- pr2_new %>% 
  filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))

6.2.4 Check duplicate entries

 pr2_new %>% 
  count(pr2_accession) %>% 
  filter(n > 1)

## # A tibble: 0 x 2
## # ... with 2 variables: pr2_accession <chr>, n <int>

6.3 Get some statistics

 ggplot(pr2_new, aes(x=sequence_length)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

6.3.1 Final files for uploading

FInal number of sequences: 7032

 sprintf(" Final number of sequences added:  %d", nrow(pr2_new))

## [1] " Final number of sequences added:  7032"

 colnames(pr2_new)

##  [1] "genbank_accession"       "gb_definition"          
##  [3] "gb_organism"             "gb_strain"              
##  [5] "gb_organelle"            "gb_isolate"             
##  [7] "gb_clone"                "gb_specimen_voucher"    
##  [9] "gb_collected_by"         "gb_lat_lon"             
## [11] "gb_note"                 "gb_culture_collection"  
## [13] "gb_isolation_source"     "gb_host"                
## [15] "gb_environmental_sample" "gb_collection_date"     
## [17] "gb_country"              "gb_date"                
## [19] "pr2_longitude"           "pr2_latitude"           
## [21] "pr2_sample_type"         "gb_sequence"            
## [23] "sequence_length"         "start"                  
## [25] "end"                     "label"                  
## [27] "pr2_accession"           "reference_sequence"     
## [29] "gene"                    "organelle"              
## [31] "ambiguities"             "sequence_hash"

 pr2_update_new <- pr2_new
 
  pr2_main_new <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, organelle, gene) %>% 
    mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
  nrow(pr2_main_new)

## [1] 7032

  pr2_sequences_new <- pr2_update_new %>% 
    select(pr2_accession, gb_sequence, sequence_length, ambiguities, sequence_hash)
  nrow(pr2_sequences_new)

## [1] 7032

  pr2_metadata_new <- pr2_update_new %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>% 
    select( -pr2_accession, -gb_sequence) %>% 
    distinct()
  nrow(pr2_metadata_new)

## [1] 7032

7 Save everything to an Excel file

 file_pr2_imports <-  full_path("pr2_imports_final.xlsx")
 onglets <- list(#"pr2_taxo_updated" = pr2_taxo_updated,
                 # "pr2_taxo_remove" = pr2_taxo_remove,
                 #"pr2_main_old" = pr2_main_old, 
                 # "pr2_metadata_old" = pr2_metadata_old,
                 "pr2_main_new" = pr2_main_new, 
                 "pr2_sequence_new" = pr2_sequences_new,
                 "pr2_metadata_new" = pr2_metadata_new)
 write.xlsx(onglets, file_pr2_imports, zoom = 90, firstRow = TRUE, firstCol = TRUE)

8 Get metadata for which no metadata are available

8.1 Find sequences for which no metadata

pr2_metadata_missing <- pr2 %>% 
  filter(!(genbank_accession %in% pr2_metadata$genbank_accession))

pr2_genbank_new <- pr2_metadata_missing %>% 
  select(genbank_accession) %>% 
  filter(!is.na(genbank_accession)) %>% 
  filter(!str_detect(genbank_accession, "^10-|^Par"))

8.2 Save the list of new sequences

saveRDS(pr2_genbank_new, full_path("pr2_genbank_new_4.rds"))

8.3 Run the script PR2 genbank download.R in background

8.4 Read the metadata files and join by genbank accession

metafile <- list()

for (i in c(1:5)) {
  metafile[[i]] <- read_tsv(full_path(str_c("metadata/pr2_gb_metadata_4_",i,".txt")), 
                            col_type = cols(gb_collection_date = col_character())) %>% 
                    mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
pr2_metadata_old <- reduce(metafile, bind_rows) %>%  
  distinct() 

pr2_metadata_old %>% count(genbank_accession) %>%  filter (n>1)

## # A tibble: 0 x 2
## # ... with 2 variables: genbank_accession <chr>, n <int>

8.5 Save everything to an Excel file

 file_pr2_imports <-  full_path("pr2_imports_final_4.xlsx")
 onglets <- list("pr2_metadata_old" = pr2_metadata_old)
 write.xlsx(onglets, file_pr2_imports, zoom = 90, firstRow = TRUE, firstCol = TRUE)

PR2 version 4.13.0 Adding GenBank sequences

PR2 version 4.13.0 Adding GenBank sequences

1 Init

2 Read pr2

3 Set up the files

4 Get sequence list from GenBank

4.1 Compare with sequences in PR2

5 New sequences

5.1 Filter new sequences

5.2 Save the list of new sequences

5.3 Run the script PR2 genbank download.R in background

5.4 Read the metadata files and join by genbank accession

6 Finalization

6.1 Read back the taxo corrected

6.2 New sequences

6.2.1 Read new sequences once the corrections have been done

6.2.2 Add fields

6.2.3 Remove sequences with ambiguities or too short

6.2.4 Check duplicate entries

6.3 Get some statistics

6.3.1 Final files for uploading

7 Save everything to an Excel file

8 Get metadata for which no metadata are available

8.1 Find sequences for which no metadata

8.2 Save the list of new sequences

8.3 Run the script PR2 genbank download.R in background

8.4 Read the metadata files and join by genbank accession

8.5 Save everything to an Excel file

PR2 version 4.13.0
Adding GenBank sequences

PR2 version 4.13.0
Adding GenBank sequences