PR2 version 4.12.0
Part A - Plastid 16S

1 Initialize libraries
2 Read pr2
3 PhytoRef Euk
4 PhytoRef Cyanos
5 Checking the taxonomy of the unknown sequences using Wang assigner - 927 sequences reassigned
6 Save all the files

1 Initialize libraries

By default do not run any of the chunks when knitting

  knitr::opts_chunk$set(eval=FALSE,
                        cache=FALSE)

Load the variables common to the different scripts and the necessary libraries

  source('PR2_init.R', echo=TRUE)

2 Read pr2

  pr2  <- pr2_read() %>% dplyr::filter (is.na(removed_version))

  pr2_db_con <- db_connect(db_info("pr2_local"))

  pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
  pr2_taxo  <- pr2_taxo %>% 
    filter (is.na(taxo_removed_version))%>% 
    collect()

3 PhytoRef Euk

3.1 PhytoRef read files

Create a table from existing information

Worksheet PhytoRef.xlsx * phytoref : the final phytoref table that will be imported into PR2 * 01-phytoref_download_2019 : contains the tab delimited file availabel from Roscoff PhytoRef site * 02-taxonomy_table_21_11_2014 : contains taxonomy table from 2014 with some metadata * 04-phytoref_new_seq : contains new sequence deposited to Genbank and that are not in the current PhytoRef

Fasta PhytoRef_with_taxonomy.fasta * Fasta file with taxonomy

full_path <- function(file_name) { str_c("../updates/2019 PhytoRef/", file_name)}

excel_file <- full_path("PhytoRef.xlsx")
fasta_file <- full_path("PhytoRef original database/PhytoRef_with_taxonomy.fasta")

# Tab download
phytoref_01 <- read_xlsx(excel_file, sheet = "01-phytoref_download_2019")

# Old taxo table
phytoref_02 <- read_xlsx(excel_file, sheet = "02-taxonomy_table_21_11_2014", range = "A1:W6675") %>% 
  mutate(species_02 = str_replace_all(species_02, " ","_"))

# Fasta file file taxonomy
phytoref_03 <- fasta_read(fasta_file) %>% 
  rename(sequence_03=sequence)

# Sequences that have been submitted to GenBank
phytoref_04 <- read_xlsx(excel_file, sheet = "04-phytoref_new_seq", range = "A1:F337") %>% 
  mutate(species_04 = str_replace_all(species_04, " ","_"))


df1 <- data.frame(str_split_fixed(phytoref_03$seq_name, "#", 2), stringsAsFactors = FALSE)  
df2 <- data.frame(str_split_fixed(df1$X2, "[|]", 11) , stringsAsFactors = FALSE)

df1 <- df1 %>% mutate (phytoref_id=as.numeric(X1)) %>% select(phytoref_id)
df2 <- df2 %>% select(genbank_accession_03 = X1, species_03=X11) %>% 
  mutate(species_03 = str_replace_all(species_03, " ","_"))

phytoref_03 <- bind_cols(df1,df2)

  

phytoref <- phytoref_01 %>% 
  left_join(phytoref_02) %>% 
  left_join(phytoref_03) %>% 
  left_join(phytoref_04, by=c("strain_01"="strain_04"))

3.2 Basic cleaning

3.2.1 Remove Sequences in phytoref_01 not found in phytoref_02: 10 sequences

These are duplicated sequences
Removed

filter(phytoref, is.na(phytoref_accession_02)) %>% select(phytoref_accession_01)

phytoref <- phytoref %>%  filter(!is.na(phytoref_accession_02))

3.2.2 Sequences from Cyanobacteria: 184 sequences

filter(phytoref, kingdom != "Eukaryota") %>% select(phytoref_accession_01)

3.2.3 Add GenBank Accession number for sequences deposited for the PhytoRef paper

phytoref <- phytoref %>% 
  mutate(genbank_accession_01 = case_when( is.na(genbank_accession_01) ~ genbank_accession_04,
                                           TRUE ~ genbank_accession_01))

3.2.4 Remove sequences with missing GenBank ID : 74

It seems that these sequences never made it to GenBank

filter(phytoref, is.na(genbank_accession_01)) %>% select(phytoref_accession_01)

phytoref <- filter(phytoref, !is.na(genbank_accession_01))

3.2.5 Find contradiction in species : 35

Use species_02 as reference

filter (phytoref, species_02 != species_03) %>% 
  select(phytoref_id, phytoref_accession_01, species_02, species_03, species_04)

phytoref <- phytoref %>% 
  rename(species=species_02)

3.2.6 Split euks and cyanos

phytoref_cyano <- phytoref %>%  filter(kingdom != "Eukaryota")
phytoref <- phytoref %>%  filter(kingdom == "Eukaryota")

3.2.7 Join to PR2 taxonomy table

Join

pr2_taxo <- pr2_taxo %>% 
  select(taxo_id, supergroup_2=supergroup, class_pr2 = class, genus_pr2 = genus, species)

phytoref <- phytoref %>% 
  left_join(pr2_taxo, by = c("species" = "species"))

filter(phytoref, is.na(taxo_id)) %>% 
  select(phytoref_accession_01, supergroup, division, class, genus, species)

3.2.8 Save the Phytoref file

phytoref_out <- phytoref %>% 
  select(-strain_01, phytoref_accession_02, -taxo5, -taxo7, 
         -genbank_accession_02, -species_03, 
         - class_04, - collection_04, - area_04, - genbank_accession_04) %>% 
  arrange(kingdom, supergroup, division, class, order, family, genus, species)
write_tsv (phytoref_out, full_path("PhytoRef_clean.tsv"), na="")

3.3 Update taxonomy in pr2_taxo table

3.3.1 Add new record to taxonomy table for existing genera

Species are written to a new file which is imported into pr2_taxo

 pr2_genus <- pr2_taxo %>% 
  select(kingdom:genus) %>% 
  distinct()

 phytoref_taxo <- read_xlsx(excel_file, sheet = "phytoref_clean") %>% 
  filter(is.na(species)) %>% 
  select(species_old, genus) %>% 
  inner_join(pr2_genus) %>% 
  distinct() %>% 
  rename(species=species_old) %>% 
  mutate(taxo_edited_version="4.12.0", taxo_edited_by="D.Vaulot") %>% 
  write_tsv (full_path("PhytoRef_new_species.tsv"), na="")

3.3.2 Check that now all species are in pr2

phytoref_species <- read_xlsx(excel_file, sheet = "phytoref_clean") %>% 
  select(phytoref_id, species) %>% 
  left_join(pr2_taxo) %>% 
  filter(is.na(genus))
phytoref_species

3.4 Add Genbank metadata

3.4.1 Read the metadata files and join by genbank accession

phytoref_clean <- read_xlsx(excel_file, sheet = "phytoref_clean") %>% 
  rename(genbank_accession = genbank_accession_01, sequence=sequence_01, strain=strain_02) %>% 
  select(phytoref_id:label, genbank_accession, kingdom:genus, species, strain, organelle:comment_2)

metafile <- list()

for (i in c(1:7,12:13,31:39,310:312)) {
  metafile[[i]] <- read_tsv(full_path(str_c("genbank metadata/PhytoRef_metadata_",i,".txt")), 
                            col_type = cols(gb_collection_date = col_character())) %>% 
  mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows

metadata <- reduce(metafile, bind_rows) %>% 
  select(-sequence, -sequence_length, -contains("c.specimen_voucher")) %>% 
  distinct()

phytoref_meta <- left_join(phytoref_clean, metadata, by = c("genbank_accession" = "genbank_accession"))

write_tsv (phytoref_meta, full_path("PhytoRef_metadata.tsv"), na="")

3.4.2 Remove sequences with no GenBank Accession number : 267 sequences

These sequences correspond to sequences that have been removed from Genbank * by submitter (bad sequence) * because sequence was not done by submitter

 filter(phytoref_meta, is.na(gb_definition) ) %>% 
  select(phytoref_accession_01, class, genus, species) 
 
 phytoref_meta <- phytoref_meta %>%  filter(!is.na(gb_definition) )

3.5 Other checks

3.5.1 Remove sequences that originate from mitochondria : 5 sequences

 filter(phytoref_meta, str_detect(gb_definition, "mitocho") ) %>% 
  select(phytoref_accession_01, class, genus, species,gb_definition) 
 
 phytoref_meta <- phytoref_meta %>%  filter(!is.na(gb_definition) )

3.6 Finalization

3.6.1 Add fields

 phytoref_final <- phytoref_meta %>% 
  rename(sequence_length_01 = sequence_length,
         eukref_specific_host = `comment 1`,
         eukref_biotic_relationship = symbiosis) %>% 
  mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene="16S rRNA",
         organelle = case_when( organelle == "apicoplast" ~ "apicoplast",
                                TRUE ~ "plastid"),
         pr2_sequence_origin = case_when (str_detect(gb_definition, "genom") ~ "genome",
                                          TRUE ~ NA_character_), 
         
         sequence_length = str_length(sequence),
         ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex))

3.6.2 Remove sequences with ambiguities or too short - 102 sequences

# First find those sequence with different sequence length between PhytoRef and computed

filter(phytoref_final, (sequence_length != sequence_length_01)) %>% 
  select(phytoref_accession_01, class, genus, sequence_length, sequence_length_01, species,gb_definition )

filter(phytoref_final, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(phytoref_accession_01, class, genus, sequence_length,ambiguities, species,gb_definition)

phytoref_final<- phytoref_final %>% 
  filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))

3.6.3 Remove duplicate entries - 1

 phytoref_final %>% group_by(pr2_accession) %>% summarise(n_seq = n()) %>% filter(n_seq > 1)
 
 phytoref_final <- phytoref_final %>% filter(phytoref_id != 323)

3.6.4 Find how many GenBank sequences have more than one entry - 739 (corresponds to palstid genomes)

 phytoref_final %>% group_by(genbank_accession) %>% summarise(n_seq = n()) %>% filter(n_seq > 1)

3.6.5 Find Genbank already in PR2 - 0

 phytoref_final %>% filter(genbank_accession %in% pr2$genbank_accession)

3.6.6 Final files for uploading

 sprintf(" Final number of 16S plastid sequences:  %d", nrow(phytoref_final))
 colnames(phytoref_final)

 write_tsv (phytoref_final, full_path("PhytoRef_final.tsv"), na="")
 
 pr2_update_new <- phytoref_final
 
  pr2_main_add <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>% 
    mutate(added_version = "4.12.0", edited_by = "Phytoref")
  nrow(pr2_main_add)
  
  pr2_sequences_add <- pr2_update_new %>% 
    select(pr2_accession, sequence, sequence_length, ambiguities)
  nrow(pr2_sequences_add)
  
  pr2_metadata_add <- pr2_update_new %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>% 
    select( -pr2_accession) %>% 
    distinct()
  nrow(pr2_metadata_add)
  
  # Write to files that are then uploaded into the datbase (easier for error tracking)
    
  write_tsv(pr2_main_add,full_path("pr2_main_add.txt"), na="")
  write_tsv(pr2_sequences_add,full_path("pr2_sequences_add.txt"), na="")
  write_tsv(pr2_metadata_add,full_path("pr2_metadata_add.txt"), na="")

4 PhytoRef Cyanos

4.1 Download the metadata

phytoref_cyano <- phytoref_cyano %>%  
  rename(sequence = sequence_01, strain=strain_01, genbank_accession=genbank_accession_01) %>% 
  select(phytoref_id:label, genbank_accession, strain, kingdom:species) 
  
phytoref_cyano_genbank <- phytoref_cyano %>% 
    select(genbank_accession) %>% 
    distinct()

4.1.1 Save the Phytoref file

write_tsv (phytoref_cyano, full_path("PhytoRef_cyano_clean.tsv"), na="")

4.2 Update taxonomy in pr2_taxo table

Species are written to a new file which is imported into pr2_taxo

 phytoref_taxo <- read_xlsx(excel_file, sheet = "phytoref_cyano_clean") %>% 
  filter(!is.na(kingdom)) %>% 
  distinct(kingdom, supergroup, division, class, order, family, genus, species) %>% 
  mutate(taxo_edited_version="4.12.0", taxo_edited_by="D.Vaulot") %>% 
  write_tsv (full_path("PhytoRef_cyano_new_species.tsv"), na="")

4.3 Add Genbank metadata

phytoref_cyano_clean <- read_xlsx(excel_file, sheet = "phytoref_cyano_clean")

metafile <- list()

for (i in c(1:2)) {
  metafile[[i]] <- read_tsv(full_path(str_c("genbank metadata/PhytoRef_metadata_cyano_",i,".txt")), 
                            col_type = cols(gb_collection_date = col_character())) %>% 
  mutate(gb_isolate=as.character(gb_isolate))
}

metadata <- reduce(metafile, bind_rows) %>% 
  distinct()

phytoref_cyano_meta <- left_join(phytoref_cyano_clean, metadata, by = c("genbank_accession" = "genbank_accession"))

write_tsv (phytoref_cyano_meta, full_path("PhytoRef_cyano_metadata.tsv"), na="")

4.3.1 Remove sequences with no GenBank Accession number : 1 sequence

These sequences correspond to sequences that have been removed from Genbank * by submitter (bad sequence) * because sequence was not done by submitter

 filter(phytoref_cyano_meta, is.na(gb_definition) ) %>% 
  select(phytoref_accession_01, class, genus, species) 
 
 phytoref_cyano_meta <- phytoref_cyano_meta %>%  filter(!is.na(gb_definition) )

4.4 Finalization

4.4.1 Add fields

 phytoref_cyano_final <- phytoref_cyano_meta %>%  
  mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene="16S rRNA",
         organelle = case_when( species == "Paulinella_chromatophora" ~ "chromatophore",
                                TRUE ~ NA_character_),
         pr2_sequence_origin = case_when (str_detect(gb_definition, "genom") ~ "genome",
                                          TRUE ~ NA_character_), 
         sequence_length = str_length(sequence),
         ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex))

4.4.2 Remove sequences with ambiguities or too short - 0 sequences

filter(phytoref_cyano_final, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(phytoref_accession_01, class, genus, sequence_length,ambiguities, species,gb_definition)

phytoref_cyano_final<- phytoref_cyano_final %>% 
  filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))

4.4.3 Remove duplicate entries - 0

 phytoref_cyano_final %>% group_by(pr2_accession) %>% summarise(n_seq = n()) %>% filter(n_seq > 1)

4.4.4 Find how many GenBank sequences have more than one entry - 0

 phytoref_cyano_final %>% group_by(genbank_accession) %>% summarise(n_seq = n()) %>% filter(n_seq > 1)

4.4.5 Find Genbank already in PR2 - 0

 phytoref_cyano_final %>% filter(genbank_accession %in% pr2$genbank_accession)

4.4.6 Final files for uploading

 sprintf(" Final number of 16S plastid sequences:  %d", nrow(phytoref_cyano_final))
 colnames(phytoref_cyano_final)

 write_tsv (phytoref_cyano_final, full_path("phytoref_cyano_final.tsv"), na="")
 
 pr2_update_new <- phytoref_cyano_final
 
  pr2_main_add <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>% 
    mutate(added_version = "4.12.0", edited_by = "Phytoref")
  nrow(pr2_main_add)
  
  pr2_sequences_add <- pr2_update_new %>% 
    select(pr2_accession, sequence, sequence_length, ambiguities)
  nrow(pr2_sequences_add)
  
  pr2_metadata_add <- pr2_update_new %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>% 
    select( -pr2_accession) %>% 
    distinct()
  nrow(pr2_metadata_add)
  
  # Write to files that are then uploaded into the datbase (easier for error tracking)
    
  write_tsv(pr2_main_add,full_path("pr2_main_add_cyanos.txt"), na="")
  write_tsv(pr2_sequences_add,full_path("pr2_sequences_add_cyanos.txt"), na="")
  write_tsv(pr2_metadata_add,full_path("pr2_metadata_add_cyanos.txt"), na="")

5 Checking the taxonomy of the unknown sequences using Wang assigner - 927 sequences reassigned

5.1 Run Wang classifier (dada2)

Reference database : 16S plastid with known species (do not contain _X in species field)
Query sequences, all those contain _X in the species field

  pr2  <- pr2_read() %>% dplyr::filter (is.na(removed_version))

  pr2_plastid <- pr2 %>%  filter(str_detect(gene,"16S"))
  
  pr2_plastid_known <- pr2_plastid %>% filter (!str_detect(species,"_X"))
  pr2_plastid_unknown <- pr2_plastid %>% filter (str_detect(species,"_X"))
  
  pr2_bad <- pr2_plastid_known %>% filter(str_detect(sequence,"X"))
  


pr2_export(pr2_plastid_unknown, file_name=full_path("plastid_unknown.fasta.gz"))
pr2_export(pr2_plastid_known, file_name=full_path("plastid_known.dada2.gz"), file_format="dada2")

pr2_plastid_unkown_assgined <- dada2_assign(seq_file_name = full_path("plastid_unknown.fasta"),
  ref_file_name = full_path("plastid_known.dada2.gz"))

5.2 Read the classifier file

Rules 1. Sequences classified as Bacteria are removed from database (manually) : 9 2. Sequences that have 90 bootstrap at species level are assigned to species 3. Sequences that have 90 bootstrap at genus level are assigned to genus For the rest the assignation does not change

Look also at sequences for which pr2 assignement and wang assginement disagree…

min_boot = 90

pr2_plastid_unknown_reassigned <- read_tsv(full_path("plastid_unknown.dada2.taxo") ) %>% 
  tidyr::separate(into= c("pr2_accession", "seq_label", str_c("taxo", 1:8) ), col=seq_name, sep = "[|]")

print("Sequence assigned to bacteria and removed from pr2")
pr2_plastid_unknown_reassigned %>%  
  filter(kingdom == "Bacteria")

pr2_plastid_unknown_reassigned <- pr2_plastid_unknown_reassigned %>%  
  filter(kingdom != "Bacteria")

print("Sequence contradiction at supergroup level")
pr2_plastid_unknown_reassigned %>%  
  filter(taxo2 != supergroup) %>% 
  select(pr2_accession, seq_label, taxo2, taxo8, supergroup, species, supergroup_boot, species_boot)

print("Sequence contradiction at division level")
pr2_plastid_unknown_reassigned %>%  
  filter((taxo3 != division)&(taxo2 == supergroup)) %>% 
  select(pr2_accession, seq_label, taxo3, taxo8, division, species, division_boot, species_boot)

print("Sequence contradiction at class level")
pr2_plastid_unknown_reassigned %>%  
  filter((taxo4 != class)&(taxo3 != division)&(taxo2 == supergroup)) %>% 
  select(pr2_accession, seq_label, taxo4, taxo8, class, species, class_boot, species_boot)

pr2_plastid_unknown_reassigned <- pr2_plastid_unknown_reassigned %>%  
  filter(genus_boot >= min_boot) %>% 
  mutate(species_new = case_when(species_boot >= min_boot ~ species,
                                 genus_boot >= min_boot ~ str_c(genus,"_sp."),
                                 TRUE ~ NA_character_) ) %>%
  filter(!is.na(species_new)) 

pr2_plastid_unknown_reassigned %>% 
  select(pr2_accession, seq_label, taxo4, taxo7, taxo8, class, species_new, genus_boot, species_boot) %>% 
  write_tsv(full_path("phytoref_reassigned.tsv"), na="")

# Check new species that are not in pr2 and enter them manually

pr2_plastid_unknown_reassigned %>% filter(!(species_new %in% pr2$species)) %>% 
  count(species_new)

5.3 Update the sequence taxonomy in PR2

pr2_update <- pr2_plastid_unknown_reassigned %>% 
  tidyr::separate(col=pr2_accession, into="genbank_accession", sep="[.]" ) %>%
  select(-species) %>% 
  rename(species = species_new)

pr2_update <- pr2_sequence_reassign(pr2_update)

6 Save all the files

   pr2_export_all()

PR2 version 4.12.0 Part A - Plastid 16S

PR2 version 4.12.0 Part A - Plastid 16S

1 Initialize libraries

2 Read pr2

3 PhytoRef Euk

3.1 PhytoRef read files

3.2 Basic cleaning

3.2.1 Remove Sequences in phytoref_01 not found in phytoref_02: 10 sequences

3.2.2 Sequences from Cyanobacteria: 184 sequences

3.2.3 Add GenBank Accession number for sequences deposited for the PhytoRef paper

3.2.4 Remove sequences with missing GenBank ID : 74

3.2.5 Find contradiction in species : 35

3.2.6 Split euks and cyanos

3.2.7 Join to PR2 taxonomy table

3.2.8 Save the Phytoref file

3.3 Update taxonomy in pr2_taxo table

3.3.1 Add new record to taxonomy table for existing genera

3.3.2 Check that now all species are in pr2

3.4 Add Genbank metadata

3.4.1 Read the metadata files and join by genbank accession

3.4.2 Remove sequences with no GenBank Accession number : 267 sequences

3.5 Other checks

3.5.1 Remove sequences that originate from mitochondria : 5 sequences

3.6 Finalization

3.6.1 Add fields

3.6.2 Remove sequences with ambiguities or too short - 102 sequences

3.6.3 Remove duplicate entries - 1

3.6.4 Find how many GenBank sequences have more than one entry - 739 (corresponds to palstid genomes)

3.6.5 Find Genbank already in PR2 - 0

3.6.6 Final files for uploading

4 PhytoRef Cyanos

4.1 Download the metadata

4.1.1 Save the Phytoref file

4.2 Update taxonomy in pr2_taxo table

4.3 Add Genbank metadata

4.3.1 Remove sequences with no GenBank Accession number : 1 sequence

4.4 Finalization

4.4.1 Add fields

4.4.2 Remove sequences with ambiguities or too short - 0 sequences

4.4.3 Remove duplicate entries - 0

4.4.4 Find how many GenBank sequences have more than one entry - 0

4.4.5 Find Genbank already in PR2 - 0

4.4.6 Final files for uploading

5 Checking the taxonomy of the unknown sequences using Wang assigner - 927 sequences reassigned

5.1 Run Wang classifier (dada2)

5.2 Read the classifier file

5.3 Update the sequence taxonomy in PR2

6 Save all the files

PR2 version 4.12.0
Part A - Plastid 16S

PR2 version 4.12.0
Part A - Plastid 16S