PR2 version 4.13.0
Adding Silva sequences

1 Init
2 Read pr2
3 Set up the files
4 Read fasta file
5 Old sequences
6 New sequences
7 Finalization
8 One more round of search for metadata
9 Get taxonomy of Silva and Genbank sequences from hash value
- 9.1 First iteration
- 9.2 Second iteration
10 Use dada2 AssignTaxonomy to assing the taxonomy of the Silva sequences

1 Init

Load the variables common to the different scripts and the necessary libraries

  knitr::opts_chunk$set(eval=FALSE, 
                        cache = TRUE, 
                        # cache.extra = file.info("../updates/2019 Arsenieff diatoms/pr2_diatoms_2019_11_21_LA.xlsx"),
                        tidy = FALSE)

  source('PR2_init.R', echo=FALSE)

2 Read pr2

Use the PR2 from Google database

# Info for the Google dabase 
  pr2_db <- db_info("pr2_google")

  pr2_db_con <- db_connect(pr2_db)
  
  pr2_main <- tbl(pr2_db_con, "pr2_main") %>% 
    collect()

  # pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>% 
  #   collect()

  pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>% 
    filter (is.na(taxo_removed_version)) %>% 
    collect()

  pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>% 
    collect()

# Join the tables and keep only sequences that are not removed

  pr2 <- pr2_main %>% 
  left_join(pr2_taxo, by = c("species"="species")) %>% 
  # left_join (pr2_seq) %>% 
  left_join (pr2_metadata)
  
  db_disconnect(pr2_db_con)
  
  pr2_18S  <- pr2 %>% 
    filter (is.na(removed_version)) %>% 
    filter(gene == "18S_rRNA") %>% 
    filter(!is.na(species)) %>% 
    filter(!is.na(kingdom))
  
  pr2_18S_no_taxo  <- pr2 %>%  
    filter(gene == "18S_rRNA") %>% 
    filter(is.na(species))
  
  pr2_18S_removed <- pr2 %>% filter (!is.na(removed_version)) %>% 
    filter(gene == "18S_rRNA")%>% 
    filter(!is.na(species))
  
# Remove pr2 to make sure that we do not use the full pr2
  # remove(pr2)
  pr2_18S %>% filter(is.na(kingdom))
  
  # pr2_export(pr2_18S, file_name=pr2.env$file_18S_fasta_dada2, file_type="fasta", file_format="dada2")

3 Set up the files

  dir_pr2_update <- "../updates/done - 2020 4.13.0 Silva"
  
  pr2.env$editor <- "D. Vaulot"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

  # file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva samples.fasta")
  file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva.fasta.gz")
  
# create the directory for taxonomy output
  dir.create(full_path("taxo"), showWarnings = FALSE)

4 Read fasta file

  pr2_update <- Biostrings::readRNAStringSet(file_fasta)

# Convert to DNA
  pr2_update <- Biostrings::DNAStringSet(pr2_update)

# Convert to Data frame
  pr2_update <- data.frame(sequence=pr2_update) %>%
                rownames_to_column(var = "seq_name") %>% 
                tidyr::separate(seq_name, c("genbank_accession", "start", "end", "silva_taxonomy"), sep = "[. ]", extra = "merge") %>% 
                mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_U"))

4.1 Save to xlsx for import into pr2

  write.xlsx(select(pr2_update, -sequence), full_path("pr2_imports_silva.xlsx"))

4.2 Check sequences from PR2 that are assigned that have the same start-end than Silva

PR2 sequences with species: 193084
PR2 sequences with species and silva taxonomy based on pr2_accession: 162055

  test <- pr2_main %>% 
    filter(!is.na(species)) %>% 
    left_join(select(pr2_update, pr2_accession, silva_taxonomy)) %>% 
    filter(!is.na(silva_taxonomy))

4.3 Compare with sequences in PR2

  str_c("Number of updated sequences that are not present in PR2 : ", 
        nrow(filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))))
  
  str_c("Number of updated sequences that are not active in PR2 : ", 
        nrow(filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession))))

   str_c("Sequences duplicated  - df ") 
   pr2_update %>% 
     count(genbank_accession) %>% 
     filter(n > 1)

5 Old sequences

 pr2_metadata_old <- pr2_update %>% 
  filter(genbank_accession %in% pr2$genbank_accession) %>% 
  select(genbank_accession, silva_taxonomy)

6 New sequences

6.1 Filter new sequences

pr2_new <- pr2_update %>% 
  filter(!(genbank_accession %in% pr2$genbank_accession)) %>% 
  mutate(sequence_length = str_length(sequence),
         ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex) ) %>% 
  filter((sequence_length >= pr2.env$sequence_length_min) & 
         (ambiguities <= pr2.env$ambiguities_max)) %>% 
  filter(start < 20000) # to avoid too big sequences from genomes
  
pr2_genbank_new <- pr2_new %>% 
  select(genbank_accession)

6.2 Save the list of new sequences

saveRDS(pr2_genbank_new, full_path("pr2_genbank_new.rds"))

6.3 Get some statistics

 ggplot(pr2_new, aes(x=sequence_length)) +
  geom_histogram()

  pr2_new %>%  
  tidyr::separate(silva_taxonomy, c("taxo1", "taxo2", "taxo3", "taxo4", "taxo5"), sep = "[;]", extra = "merge") %>% 
  count(taxo1, taxo2, taxo3, taxo4) %>% 
  arrange(-n)

6.4 Alveolata

  silva_alveolata <- pr2_new %>% 
    filter(str_detect(silva_taxonomy, "Alveolata")) %>% 
    mutate(seq_name = str_c(genbank_accession,".", start, ".", end, " ", silva_taxonomy))

  cat ("Number of Alveolata sequences", nrow(silva_alveolata))
  
  fasta_write(silva_alveolata, full_path("silva_alveolata.fasta.gz"), compress=TRUE, taxo_include = FALSE)

6.5 Run the script PR2 genbank download.R in background

6.6 Read the metadata files and join by genbank accession

metafile <- list()

for (i in c(1:300)) {
  file_name = full_path(str_c("metadata_01/pr2_gb_metadata_",i,".txt"))
  if(file.exists(file_name)) {
  metafile[[i]] <- read_tsv(file_name, 
                            col_types = cols(gb_collection_date = col_character(),
                                             gb_strain =  col_character(), 
                                             gb_isolate =  col_character()),
                            guess_max = 10000) 
  }
}

for (i in c(1:70)) {
  file_name = full_path(str_c("metadata_02/pr2_gb_metadata_2_",i,".txt"))
  if(file.exists(file_name)) {
  metafile[[i+300]] <- read_tsv(file_name, 
                                col_types = cols(gb_collection_date = col_character(),
                                                 gb_strain =  col_character(), 
                                                 gb_isolate =  col_character()),
                                guess_max = 10000) 
  }
}

for (i in c(1:100)) {
  file_name = full_path(str_c("metadata_05/pr2_gb_metadata_5_",i,".txt"))
  if(file.exists(file_name)) {
  metafile[[i+370]] <- read_tsv(file_name, 
                                col_types = cols(gb_collection_date = col_character(),
                                                 gb_strain =  col_character(), 
                                                 gb_isolate =  col_character()),
                                guess_max = 10000) 
  }
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows)%>% 
  filter(!is.na(gb_organism)) %>% # Remove entry that do not exist
  select(-sequence) %>%  
  distinct() 


pr2_new <- left_join(pr2_new, gb_metadata, by = c("genbank_accession" = "genbank_accession"))

7 Finalization

7.1 Add fields

 pr2_new <- pr2_new %>% 
  # mutate(start=1, end=sequence_length,
  #        label = case_when (intron == "intron" ~ "G",
  #                           TRUE ~"U")) %>% 
  mutate(#sequence=str_sub(gb_sequence,start,end),
         # sequence_length = str_length(sequence),
         label = "U",
         pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene=case_when(str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA", 
                               str_detect(gb_definition, "chloroplast") ~ "16S_rRNA", 
                               TRUE ~ "18S_rRNA"),
         organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph", 
                               str_detect(gb_definition, "chloroplast") ~ "plastid", 
                               TRUE ~ "nucleus"),
         ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex)
         )

 pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1)

7.2 Remove sequences with ambiguities or too short

filter(pr2_new, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(pr2_accession,  sequence_length,ambiguities, gb_definition)

pr2_new<- pr2_new %>% 
  filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))

7.3 Check duplicate entries

 pr2_new %>% 
  count(pr2_accession) %>% 
  filter(n > 1)

 pr2_new <- pr2_new %>% 
   distinct(pr2_accession, .keep_all = TRUE)

7.4 Final files for uploading

Final number of sequences: 333,247

 sprintf(" Final number of sequences added:  %d", nrow(pr2_new))
 colnames(pr2_new)

 pr2_update_new <- pr2_new
 
  pr2_main_new <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, organelle, gene) %>% 
    mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
  nrow(pr2_main_new)
  
  pr2_sequences_new <- pr2_update_new %>% 
    select(pr2_accession, sequence, sequence_length, ambiguities, sequence_hash)
  nrow(pr2_sequences_new)
  
  pr2_metadata_new <- pr2_update_new %>% 
    filter(!is.na(gb_definition)) %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_"), starts_with("silva_")) %>% 
    select( -pr2_accession) %>% 
    distinct()
  nrow(pr2_metadata_new)

7.5 Save everything to an Excel file

 file_pr2_imports <-  full_path("pr2_imports_final.xlsx")
 onglets <- list(# "pr2_taxo_updated" = pr2_taxo_updated,
                 # "pr2_taxo_remove" = pr2_taxo_remove,
                 #"pr2_main_old" = pr2_main_old, 
                 "pr2_metadata_old" = pr2_metadata_old,
                 "pr2_main_new" = pr2_main_new, 
                 "pr2_sequence_new" = pr2_sequences_new,
                 "pr2_metadata_new" = pr2_metadata_new)
 # write.xlsx(onglets, file_pr2_imports, zoom = 90, firstRow = TRUE, firstCol = TRUE)
 
 write.xlsx(pr2_metadata_old, full_path("pr2_imports_metadata_old.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
 write.xlsx(pr2_main_new, full_path("pr2_imports_main_new.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
 write.xlsx(pr2_sequences_new, full_path("pr2_imports_sequences_new.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
 write.xlsx(pr2_metadata_new, full_path("pr2_imports_metadata_new.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)

8 One more round of search for metadata

Note: these entries correspond to contigs from metagenomes that point to a single record. They have to be searched one by one (script #6)

pr2_new_6 <- pr2 %>% 
  filter(is.na(pr2_metadata_id)) %>% 
  # slice_head(n = 200) %>% 
  select(pr2_accession, genbank_accession)

pr2_genbank_new_6 <- pr2_new_6 %>% 
  select(genbank_accession)

# saveRDS(pr2_genbank_new_6, full_path("pr2_genbank_new_6.rds"))

8.1 Run the script on server (script_genbank_06.R)

Note that some of the pr2 entries correspond to only one GenBank entries for metagenomes. This has to be changed in the pr2_main table

8.2 Read the metadata files and join by genbank accession

metafile <- list()

for (i in c(1:500)) {
  file_name = full_path(str_c("metadata_06/pr2_gb_metadata_6_",i,".txt"))
  if(file.exists(file_name)) {
  metafile[[i]] <- read_tsv(file_name, 
                            col_types = cols(gb_collection_date = col_character(),
                                             gb_strain =  col_character(), 
                                             gb_isolate =  col_character(),
                                             gb_project_id = col_character(),
                                             gb_specimen_voucher = col_character(),
                                             gb_clone = col_character() ),
                            guess_max = 10000) 
  }
}

# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>% 
  filter(!is.na(genbank_accession_new))  %>% # Remove all entries for which we could not find a GenBank entry
  distinct() 

write.xlsx(gb_metadata, full_path(str_c("metadata_06/pr2_gb_metadata_6.xlsx")))

# Some entries have the same genbank_accession and different pr2_accession
junk <- gb_metadata %>% 
  count(genbank_accession_old) %>% 
  filter(n> 1) %>% 
  arrange(desc(n))
  
pr2_new_6_metadata <- left_join(pr2_new_6, gb_metadata, by = c("genbank_accession" = "genbank_accession_old")) %>% 
  rename(genbank_accession_old = genbank_accession) %>%
  filter(!is.na(genbank_accession_new))  # Remove all entries for which we could not find a GenBank entry 

pr2_main_6 <- pr2_new_6_metadata %>% 
  select(pr2_accession, genbank_accession_new) %>% 
  rename(genbank_accession = genbank_accession_new)

# Some entries have the same genbank_accession and different pr2_accession
junk <- pr2_main_6 %>% 
  count(genbank_accession) %>% 
  filter(n> 1) %>% 
  arrange(desc(n))

pr2_metadata_6 <- pr2_new_6_metadata %>% 
  select(-pr2_accession, -genbank_accession_old) %>% 
  rename(genbank_accession = genbank_accession_new) %>% 
  filter(!(genbank_accession %in% pr2_metadata$genbank_accession)) %>% 
  distinct()

8.3 Save everything to an Excel file

pr2_main: 70668 entries corresponding to 55512 updates. replace the old genbank_accession by the new one. Some pr2_accession correspond to the same genbank entry for metagenomes, e.g. OBEP000000000 corresponding to 44921 silva sequences.
pr2_medata: 14247 entries

 write.xlsx(pr2_metadata_6, full_path("pr2_imports_metadata_6.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)
 write.xlsx(pr2_main_6, full_path("pr2_imports_main_6.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)

9 Get taxonomy of Silva and Genbank sequences from hash value

9.1 First iteration

By mistake I included all sequences including those removed…

pr2_hash <- pr2 %>%  
  count(sequence_hash) %>% 
  filter(n > 1) 

pr2_hash <- pr2 %>% 
  select(pr2_accession, species, sequence_hash) %>% 
  filter(sequence_hash %in% pr2_hash$sequence_hash) %>% 
  arrange(sequence_hash)


pr2_hash_no_species <- pr2_hash %>% 
  filter(is.na(species)) %>% 
  select(-species)

pr2_hash_species <- pr2_hash %>% 
  filter(!is.na(species)) %>% 
  select(-pr2_accession) %>% 
  distinct()
  
  
# Check whether species with same hash have same species
pr2_hash_species %>% 
  distinct(species, sequence_hash) %>% 
  count(species, sequence_hash) %>% 
  filter(n > 1)

# Merge based on hash
  pr2_hash_no_species <- pr2_hash_no_species %>% 
    left_join(pr2_hash_species) %>% 
    filter(!is.na(species)) %>% 
    mutate(edited_remark = "species assigned based on hash value")
 
  
  write.xlsx(pr2_hash_no_species, full_path("pr2_imports_species_hash.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)

9.2 Second iteration

Remove sequences similar to those already removed 2 sequences removed

pr2_hash_removed <- pr2 %>% 
  filter(!is.na(removed_version)) 

pr2_to_remove <- pr2 %>% 
  filter(is.na(removed_version))%>% 
  filter(sequence_hash %in% pr2_hash_removed$sequence_hash)

10 Use dada2 AssignTaxonomy to assing the taxonomy of the Silva sequences

10.1 Export Fasta file for Silva sequences without taxonomy

186884 sequences

pr2_no_species <- pr2 %>%
  filter(is.na(species)) %>% 
  select(pr2_accession, sequence) %>% 
  rename(seq_name = pr2_accession) %>% 
  filter(!is.na(sequence))
  
fasta_write(pr2_no_species, full_path("pr2_no_species.fasta.gz"), taxo_include = FALSE, compress = TRUE)

10.2 Launch script_assign_01.R on server

Use 4.13.0 file
Break up in pieces of 10 000 sequences

10.3 Build the pr2_assign table

metafile <- list()

for (i in c(1:34)) {
  file_name = full_path(str_c("taxo/pr2_no_species_",sprintf("%03d", i),".dada2.taxo"))
  if(file.exists(file_name)) {
  metafile[[i]] <- read_tsv(file_name, 
                            guess_max = 10000) 
  }
}

# The first file serves as a template for the following one when binding rows
pr2_assign <- reduce(metafile, bind_rows) %>%
  distinct() 

# Save the file
  write.xlsx(pr2_assign, full_path("pr2_imports_silva_assigned.xlsx"), zoom = 90, firstRow = TRUE, firstCol = TRUE)

PR2 version 4.13.0 Adding Silva sequences

PR2 version 4.13.0 Adding Silva sequences

1 Init

2 Read pr2

3 Set up the files

4 Read fasta file

4.1 Save to xlsx for import into pr2

4.2 Check sequences from PR2 that are assigned that have the same start-end than Silva

4.3 Compare with sequences in PR2

5 Old sequences

6 New sequences

6.1 Filter new sequences

6.2 Save the list of new sequences

6.3 Get some statistics

6.4 Alveolata

6.5 Run the script PR2 genbank download.R in background

6.6 Read the metadata files and join by genbank accession

7 Finalization

7.1 Add fields

7.2 Remove sequences with ambiguities or too short

7.3 Check duplicate entries

7.4 Final files for uploading

7.5 Save everything to an Excel file

8 One more round of search for metadata

8.1 Run the script on server (script_genbank_06.R)

8.2 Read the metadata files and join by genbank accession

8.3 Save everything to an Excel file

9 Get taxonomy of Silva and Genbank sequences from hash value

9.1 First iteration

9.2 Second iteration

10 Use dada2 AssignTaxonomy to assing the taxonomy of the Silva sequences

10.1 Export Fasta file for Silva sequences without taxonomy

10.2 Launch script_assign_01.R on server

10.3 Build the pr2_assign table

PR2 version 4.13.0
Adding Silva sequences

PR2 version 4.13.0
Adding Silva sequences