PR2 version 4.13.0
Adding RCC sequences

1 Init
2 Read pr2
3 Set up the files
4 RCC database
5 Taxonomy
- 5.1 Build and check
- 5.2 For taxa not yet in PR2 find if genus is present
6 New sequences
7 Finalization
- 7.1 Read back the taxo corrected
- 7.2 New sequences
8 Save everything to an Excel file

1 Init

Load the variables common to the different scripts and the necessary libraries

  knitr::opts_chunk$set(eval=TRUE, 
                        cache = TRUE, 
                        # cache.extra = file.info("../updates/2019 Arsenieff diatoms/pr2_diatoms_2019_11_21_LA.xlsx"),
                        tidy = FALSE)

  source('PR2_init.R', echo=FALSE)

2 Read pr2

Use the PR2 from Google database

# Info for the local dabase 
  pr2_db <- db_info("pr2_google")

  pr2_db_con <- db_connect(pr2_db)
  pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
  # pr2_main  <- pr2_main %>% filter (is.na(removed_version))

  # pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()

  pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
  pr2_taxo  <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()

  # pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()

# Join the tables and keep only sequences that are not removed

  pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
  # pr2 <- left_join (pr2, pr2_seq)
  # pr2 <- left_join (pr2, pr2_metadata)
  db_disconnect(pr2_db_con)

## [1] TRUE

  pr2_18S  <- pr2 %>% filter (is.na(removed_version)) %>% 
    filter(gene == "18S_rRNA")
  pr2_18S_removed <- pr2 %>% filter (!is.na(removed_version)) %>% 
    filter(gene == "18S_rRNA")
  
# Remove pr2 to make sure that we do not use the full pr2
  # remove(pr2)

3 Set up the files

  dir_pr2_update <- "../updates/2020 4.13.0 RCC"
  
  pr2.env$editor <- "D. Vaulot"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

  file_pr2_update_excel <- full_path("pr2_rcc.xlsx")  

# create the directory for taxonomy output
  dir.create(full_path("taxo"), showWarnings = FALSE)

4 RCC database

4.1 Read RCC

  db_rcc <- db_connect(db_info("rcc_local") )

  cultures <- tbl(db_rcc, "cultures") %>% 
    collect()
  taxonomy <- tbl(db_rcc, "taxonomy") %>% 
    collect()
  cultures <- left_join(cultures, taxonomy) 
  # cultures <- cultures %>% filter(lost==0)
  cultures <- cultures %>% mutate(year_entered = lubridate::year(date_entered_catalog),
                    year_sampled = lubridate::year(lubridate::parse_date_time(sampling_date, "ymd")),
                    year_lost = lubridate::year(lost_date),
                    latitude  = mapply(lat_long_dec,sampling_lat_deg,sampling_lat_mn,sampling_lat_ns,"latitude"),
                    longitude = mapply(lat_long_dec,sampling_long_deg,sampling_long_mn,sampling_long_ew,"longitude"))
  

  sequences <- tbl(db_rcc, "sequences") %>% collect()
  
  db_disconnect(db_rcc)
  
# Build some tables -------------------------------------------------------
  
  sequences_18S <- sequences %>% 
    filter(str_detect(gene_name, "18S")) %>% 
    select(rcc_id, genbank_accession, sequence) %>% 
    rename(accession_18S = genbank_accession, sequence_18S = sequence) %>% 
    mutate(sequence_18S_length = str_length(sequence_18S))
  
  sequences_18S_longer <- sequences_18S %>% 
    arrange(rcc_id, desc(sequence_18S_length)) %>% 
    distinct(rcc_id, .keep_all = TRUE)

4.2 Construct and export tables

  pr2_update <- cultures %>%
    rename(kingdom=domain) %>% 
    arrange(kingdom, division, class, order, family, genus, species, rcc_id) %>%
    select(kingdom, division, class, order, family, genus, species, rcc_id, lost_date,
           strain_name, strain_name_synonyms,
           sampling_ocean, sampling_regional_sea , sampling_depth,
           sampling_date, latitude, longitude) %>% 
    left_join(sequences_18S_longer) %>% 
    rename(genbank_accession = accession_18S) %>% 
    filter(!(genbank_accession %in% pr2$genbank_accession ) & !is.na(genbank_accession)) %>% 
    # left_join(sequences_ITS) %>% 
    select_if(~!all(is.na(.)))   %>%                                 # Remove all empty columns
    filter(sequence_18S_length >= pr2.env$sequence_length_min) %>%   # Remove sequences that are too short
    mutate(species = str_replace(species, "_sp", "_sp."))  %>%       # Replace sp by sp.
    mutate(species = str_replace(species, "_cf.+", "_sp.")) %>%      # Replace cf_xxx by sp.
    filter(!str_detect(species, "Eukaryota_XXXXX"))                  # Remove unassigned sequences
    


  write.xlsx(pr2_update, full_path("pr2_rcc.xlsx"), zoom = 90, firstActiveRow = 2, firstActiveCol = 9, colWidths = "auto")

4.3 Reload the file after manual correction

  pr2_update <- read.xlsx(full_path("pr2_rcc_corrected.xlsx"))

4.4 Compare with sequences in PR2

  str_c("Number of updated sequences that are not present in PR2 : ", 
        nrow(filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))))

## [1] "Number of updated sequences that are not present in PR2 : 1244"

  str_c("Number of updated sequences that are not active in PR2 : ", 
        nrow(filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession))))

## [1] "Number of updated sequences that are not active in PR2 : 0"

   str_c("Sequences duplicated  - df ")

## [1] "Sequences duplicated  - df "

   pr2_update %>% 
     count(genbank_accession) %>% 
     filter(n > 1)

## [1] genbank_accession n                
## <0 rows> (or 0-length row.names)

5 Taxonomy

5.1 Build and check

# Only 7 levels in RCC
taxo_levels <- c(pr2.env$taxo_levels[1], pr2.env$taxo_levels[3:8])

pr2_taxo_updated <- pr2_update  %>% 
  group_by_at(taxo_levels) %>%  
  count() %>% 
  ungroup()

pr2_taxo_check(pr2_taxo_updated, taxo_levels = taxo_levels, full_path("taxo"))

## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

## Warning: `transmute_()` is deprecated as of dplyr 0.7.0.
## Please use `transmute()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

## Joining, by = "name"

pr2_taxo_updated <- pr2_taxo_updated %>% 
  rename_all(~ str_c(.,"_new" )) %>%
  rename(species = species_new) %>% 
  left_join(pr2_taxo) %>% 
  rename_at(pr2.env$taxo_levels, ~ str_c(.,"_old" )) %>% 
  rename_all( ~ str_replace(.,"_new", "" )) %>% 
  rename(species = species_old)

## Joining, by = "species"

write.xlsx(pr2_taxo_updated, full_path("taxo_rcc.xlsx"), zoom = 90, firstRow = TRUE, colWidths = "auto")

5.2 For taxa not yet in PR2 find if genus is present

pr2_taxo_to_add <- pr2_taxo_updated %>% 
  filter(is.na(taxo_id)) %>% 
  select(genus, species, n, taxo_edited_version, taxo_edited_by) %>% 
  left_join(select(pr2_taxo, kingdom:genus)) %>% 
  distinct()

## Joining, by = "genus"

write.xlsx(pr2_taxo_to_add , full_path("taxo_to_add.xlsx"), zoom = 90, firstRow = TRUE, colWidths = "auto")

6 New sequences

6.1 List of new sequences

pr2_new <- pr2_update %>% 
  filter(!(genbank_accession %in% pr2$genbank_accession)) %>% 
  select(genbank_accession:species)

pr2_genbank_new <- pr2_new %>% 
  select(genbank_accession)

6.2 Run the script PR2 genbank download.R in background

6.3 Read the metadata files and join by genbank accession

metafile <- list()

for (i in c(1:10)) {
  metafile[[i]] <- read_tsv(full_path(str_c("pr2_gb_metadata_",i,".txt")), 
                            col_type = cols(gb_collection_date = col_character())) %>% 
                    mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>%  
  distinct() %>% 
  rename(gb_sequence = sequence)

pr2_new <- left_join(pr2_new, gb_metadata, by = c("genbank_accession" = "genbank_accession"))

6.4 Read the features file and join

gb_features <- read_tsv(full_path("pr2_gb_features.txt")) %>% 
  filter(product == "18S ribosomal RNA") %>% 
  rename(start_feature = start, end_feature = end) %>% 
  distinct()

## Parsed with column specification:
## cols(
##   seqnames = col_character(),
##   start = col_double(),
##   end = col_double(),
##   width = col_double(),
##   strand = col_character(),
##   type = col_character(),
##   product = col_character(),
##   loctype = col_character(),
##   genbank_accession = col_character(),
##   note = col_character()
## )

pr2_new <- left_join(pr2_new, gb_features, by = c("genbank_accession" = "genbank_accession"))

write.xlsx(pr2_new, full_path("pr2_new.xlsx"), na="")

7 Finalization

7.1 Read back the taxo corrected

pr2_taxo_updated <- read_excel(full_path("taxo_to_add_corrected.xlsx"), guess_max=200000, na=c("", "-"))
# pr2_taxo_remove <- read_excel(file_pr2_update_excel, sheet = "taxo_to_remove", guess_max=200000, na=c("", "-"))

7.2 New sequences

7.2.1 Read new sequences once the corrections have been done

Sequences with ITS region removed
Sequences with introns removed

pr2_new <- read_excel(full_path("pr2_new_corrected.xlsx"), na="") %>% 
  filter(!is.na(gb_definition))

7.2.2 Add fields

 pr2_new <- pr2_new %>% 
  # mutate(start=1, end=sequence_length,
  #        label = case_when (intron == "intron" ~ "G",
  #                           TRUE ~"U")) %>% 
  mutate(sequence=str_sub(gb_sequence,start,end),
         sequence_length = str_length(sequence),
         label = "U",
         pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene=case_when(str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA", 
                               str_detect(gb_definition, "chloroplast") ~ "16S_rRNA", 
                               TRUE ~ "18S_rRNA"),
         organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph", 
                               str_detect(gb_definition, "chloroplast") ~ "plastid", 
                               TRUE ~ "nucleus"),
         ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex)
         )

 pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1)

7.2.3 Remove sequences with ambiguities or too short

filter(pr2_new, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(pr2_accession, species, sequence_length,ambiguities, species,gb_definition)

## # A tibble: 1 x 5
##   pr2_accession  species   sequence_length ambiguities gb_definition            
##   <chr>          <chr>               <int>       <int> <chr>                    
## 1 MF179484.1.18~ Hemiselm~            1847          34 Hemiselmis virescens vou~

pr2_new<- pr2_new %>% 
  filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))

7.2.4 Check duplicate entries

 pr2_new %>% 
  count(pr2_accession) %>% 
  filter(n > 1)

## # A tibble: 0 x 2
## # ... with 2 variables: pr2_accession <chr>, n <int>

7.2.5 Final files for uploading

FInal number of sequences: 1129

 sprintf(" Final number of sequences added:  %d", nrow(pr2_new))

## [1] " Final number of sequences added:  1129"

 colnames(pr2_new)

##  [1] "genbank_accession"       "pr2_longitude"          
##  [3] "pr2_latitude"            "sampling_date"          
##  [5] "pr2_depth"               "pr2_sea"                
##  [7] "pr2_ocean"               "strain_name"            
##  [9] "rcc_id"                  "species"                
## [11] "gb_definition"           "gb_organism"            
## [13] "gb_organelle"            "gb_strain"              
## [15] "gb_isolate"              "gb_clone"               
## [17] "gb_specimen_voucher"     "gb_collected_by"        
## [19] "gb_lat_lon"              "gb_note"                
## [21] "gb_culture_collection"   "gb_isolation_source"    
## [23] "gb_host"                 "gb_environmental_sample"
## [25] "gb_collection_date"      "gb_country"             
## [27] "gb_date"                 "gb_locus"               
## [29] "gb_sequence"             "pr2_sample_type"        
## [31] "sequence_length"         "seqnames"               
## [33] "start"                   "end"                    
## [35] "start_feature"           "end_feature"            
## [37] "width"                   "strand"                 
## [39] "type"                    "product"                
## [41] "loctype"                 "note"                   
## [43] "sequence"                "label"                  
## [45] "pr2_accession"           "reference_sequence"     
## [47] "gene"                    "organelle"              
## [49] "ambiguities"             "sequence_hash"

 pr2_update_new <- pr2_new
 
  pr2_main_new <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>% 
    mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
  nrow(pr2_main_new)

## [1] 1129

  pr2_sequences_new <- pr2_update_new %>% 
    select(pr2_accession, sequence, gb_sequence, sequence_length, ambiguities, sequence_hash)
  nrow(pr2_sequences_new)

## [1] 1129

  pr2_metadata_new <- pr2_update_new %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>% 
    select( -pr2_accession, -gb_sequence) %>% 
    distinct()
  nrow(pr2_metadata_new)

## [1] 1129

8 Save everything to an Excel file

 file_pr2_imports <-  full_path("pr2_imports_final.xlsx")
 onglets <- list("pr2_taxo_updated" = pr2_taxo_updated,
                 # "pr2_taxo_remove" = pr2_taxo_remove,
                 #"pr2_main_old" = pr2_main_old, 
                 # "pr2_metadata_old" = pr2_metadata_old,
                 "pr2_main_new" = pr2_main_new, 
                 "pr2_sequence_new" = pr2_sequences_new,
                 "pr2_metadata_new" = pr2_metadata_new)
 write.xlsx(onglets, file_pr2_imports, zoom = 90, firstRow = TRUE, firstCol = TRUE)

## Warning in file.create(to[okay]): impossible de créer le fichier '../updates/
## 2020 4.13.0 RCC/pr2_imports_final.xlsx', à cause de 'Permission denied'

PR2 version 4.13.0 Adding RCC sequences

PR2 version 4.13.0 Adding RCC sequences

1 Init

2 Read pr2

3 Set up the files

4 RCC database

4.1 Read RCC

4.2 Construct and export tables

4.3 Reload the file after manual correction

4.4 Compare with sequences in PR2

5 Taxonomy

5.1 Build and check

5.2 For taxa not yet in PR2 find if genus is present

6 New sequences

6.1 List of new sequences

6.2 Run the script PR2 genbank download.R in background

6.3 Read the metadata files and join by genbank accession

6.4 Read the features file and join

7 Finalization

7.1 Read back the taxo corrected

7.2 New sequences

7.2.1 Read new sequences once the corrections have been done

7.2.2 Add fields

7.2.3 Remove sequences with ambiguities or too short

7.2.4 Check duplicate entries

7.2.5 Final files for uploading

8 Save everything to an Excel file

PR2 version 4.13.0
Adding RCC sequences

PR2 version 4.13.0
Adding RCC sequences