PR2 version 4.12.0
Part A - Plastid 16S
PR2 version 4.12.0
Part A - Plastid 16S
1 Initialize libraries
By default do not run any of the chunks when knitting
Load the variables common to the different scripts and the necessary libraries
2 Read pr2
3 PhytoRef Euk
3.1 PhytoRef read files
Create a table from existing information
Worksheet PhytoRef.xlsx
* phytoref
: the final phytoref table that will be imported into PR2 * 01-phytoref_download_2019
: contains the tab delimited file availabel from Roscoff PhytoRef site * 02-taxonomy_table_21_11_2014
: contains taxonomy table from 2014 with some metadata * 04-phytoref_new_seq
: contains new sequence deposited to Genbank and that are not in the current PhytoRef
Fasta PhytoRef_with_taxonomy.fasta
* Fasta file with taxonomy
full_path <- function(file_name) { str_c("../updates/2019 PhytoRef/", file_name)}
excel_file <- full_path("PhytoRef.xlsx")
fasta_file <- full_path("PhytoRef original database/PhytoRef_with_taxonomy.fasta")
# Tab download
phytoref_01 <- read_xlsx(excel_file, sheet = "01-phytoref_download_2019")
# Old taxo table
phytoref_02 <- read_xlsx(excel_file, sheet = "02-taxonomy_table_21_11_2014", range = "A1:W6675") %>%
mutate(species_02 = str_replace_all(species_02, " ","_"))
# Fasta file file taxonomy
phytoref_03 <- fasta_read(fasta_file) %>%
rename(sequence_03=sequence)
# Sequences that have been submitted to GenBank
phytoref_04 <- read_xlsx(excel_file, sheet = "04-phytoref_new_seq", range = "A1:F337") %>%
mutate(species_04 = str_replace_all(species_04, " ","_"))
df1 <- data.frame(str_split_fixed(phytoref_03$seq_name, "#", 2), stringsAsFactors = FALSE)
df2 <- data.frame(str_split_fixed(df1$X2, "[|]", 11) , stringsAsFactors = FALSE)
df1 <- df1 %>% mutate (phytoref_id=as.numeric(X1)) %>% select(phytoref_id)
df2 <- df2 %>% select(genbank_accession_03 = X1, species_03=X11) %>%
mutate(species_03 = str_replace_all(species_03, " ","_"))
phytoref_03 <- bind_cols(df1,df2)
phytoref <- phytoref_01 %>%
left_join(phytoref_02) %>%
left_join(phytoref_03) %>%
left_join(phytoref_04, by=c("strain_01"="strain_04"))
3.2 Basic cleaning
3.2.1 Remove Sequences in phytoref_01 not found in phytoref_02: 10 sequences
- These are duplicated sequences
- Removed
3.2.2 Sequences from Cyanobacteria: 184 sequences
3.2.3 Add GenBank Accession number for sequences deposited for the PhytoRef paper
3.2.4 Remove sequences with missing GenBank ID : 74
- It seems that these sequences never made it to GenBank
3.2.5 Find contradiction in species : 35
Use species_02 as reference
3.2.6 Split euks and cyanos
3.2.7 Join to PR2 taxonomy table
- Join
pr2_taxo <- pr2_taxo %>%
select(taxo_id, supergroup_2=supergroup, class_pr2 = class, genus_pr2 = genus, species)
phytoref <- phytoref %>%
left_join(pr2_taxo, by = c("species" = "species"))
filter(phytoref, is.na(taxo_id)) %>%
select(phytoref_accession_01, supergroup, division, class, genus, species)
3.2.8 Save the Phytoref file
phytoref_out <- phytoref %>%
select(-strain_01, phytoref_accession_02, -taxo5, -taxo7,
-genbank_accession_02, -species_03,
- class_04, - collection_04, - area_04, - genbank_accession_04) %>%
arrange(kingdom, supergroup, division, class, order, family, genus, species)
write_tsv (phytoref_out, full_path("PhytoRef_clean.tsv"), na="")
3.3 Update taxonomy in pr2_taxo table
3.3.1 Add new record to taxonomy table for existing genera
- Species are written to a new file which is imported into pr2_taxo
pr2_genus <- pr2_taxo %>%
select(kingdom:genus) %>%
distinct()
phytoref_taxo <- read_xlsx(excel_file, sheet = "phytoref_clean") %>%
filter(is.na(species)) %>%
select(species_old, genus) %>%
inner_join(pr2_genus) %>%
distinct() %>%
rename(species=species_old) %>%
mutate(taxo_edited_version="4.12.0", taxo_edited_by="D.Vaulot") %>%
write_tsv (full_path("PhytoRef_new_species.tsv"), na="")
3.3.2 Check that now all species are in pr2
3.4 Add Genbank metadata
3.4.1 Read the metadata files and join by genbank accession
phytoref_clean <- read_xlsx(excel_file, sheet = "phytoref_clean") %>%
rename(genbank_accession = genbank_accession_01, sequence=sequence_01, strain=strain_02) %>%
select(phytoref_id:label, genbank_accession, kingdom:genus, species, strain, organelle:comment_2)
metafile <- list()
for (i in c(1:7,12:13,31:39,310:312)) {
metafile[[i]] <- read_tsv(full_path(str_c("genbank metadata/PhytoRef_metadata_",i,".txt")),
col_type = cols(gb_collection_date = col_character())) %>%
mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
metadata <- reduce(metafile, bind_rows) %>%
select(-sequence, -sequence_length, -contains("c.specimen_voucher")) %>%
distinct()
phytoref_meta <- left_join(phytoref_clean, metadata, by = c("genbank_accession" = "genbank_accession"))
write_tsv (phytoref_meta, full_path("PhytoRef_metadata.tsv"), na="")
3.4.2 Remove sequences with no GenBank Accession number : 267 sequences
These sequences correspond to sequences that have been removed from Genbank * by submitter (bad sequence) * because sequence was not done by submitter
3.5 Other checks
3.5.1 Remove sequences that originate from mitochondria : 5 sequences
3.6 Finalization
3.6.1 Add fields
phytoref_final <- phytoref_meta %>%
rename(sequence_length_01 = sequence_length,
eukref_specific_host = `comment 1`,
eukref_biotic_relationship = symbiosis) %>%
mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = NA,
gene="16S rRNA",
organelle = case_when( organelle == "apicoplast" ~ "apicoplast",
TRUE ~ "plastid"),
pr2_sequence_origin = case_when (str_detect(gb_definition, "genom") ~ "genome",
TRUE ~ NA_character_),
sequence_length = str_length(sequence),
ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex))
3.6.2 Remove sequences with ambiguities or too short - 102 sequences
# First find those sequence with different sequence length between PhytoRef and computed
filter(phytoref_final, (sequence_length != sequence_length_01)) %>%
select(phytoref_accession_01, class, genus, sequence_length, sequence_length_01, species,gb_definition )
filter(phytoref_final, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>%
arrange(desc(sequence_length)) %>%
select(phytoref_accession_01, class, genus, sequence_length,ambiguities, species,gb_definition)
phytoref_final<- phytoref_final %>%
filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))
3.6.3 Remove duplicate entries - 1
3.6.4 Find how many GenBank sequences have more than one entry - 739 (corresponds to palstid genomes)
3.6.5 Find Genbank already in PR2 - 0
3.6.6 Final files for uploading
sprintf(" Final number of 16S plastid sequences: %d", nrow(phytoref_final))
colnames(phytoref_final)
write_tsv (phytoref_final, full_path("PhytoRef_final.tsv"), na="")
pr2_update_new <- phytoref_final
pr2_main_add <- pr2_update_new %>%
select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>%
mutate(added_version = "4.12.0", edited_by = "Phytoref")
nrow(pr2_main_add)
pr2_sequences_add <- pr2_update_new %>%
select(pr2_accession, sequence, sequence_length, ambiguities)
nrow(pr2_sequences_add)
pr2_metadata_add <- pr2_update_new %>%
select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>%
select( -pr2_accession) %>%
distinct()
nrow(pr2_metadata_add)
# Write to files that are then uploaded into the datbase (easier for error tracking)
write_tsv(pr2_main_add,full_path("pr2_main_add.txt"), na="")
write_tsv(pr2_sequences_add,full_path("pr2_sequences_add.txt"), na="")
write_tsv(pr2_metadata_add,full_path("pr2_metadata_add.txt"), na="")
4 PhytoRef Cyanos
4.1 Download the metadata
phytoref_cyano <- phytoref_cyano %>%
rename(sequence = sequence_01, strain=strain_01, genbank_accession=genbank_accession_01) %>%
select(phytoref_id:label, genbank_accession, strain, kingdom:species)
phytoref_cyano_genbank <- phytoref_cyano %>%
select(genbank_accession) %>%
distinct()
4.1.1 Save the Phytoref file
4.2 Update taxonomy in pr2_taxo table
- Species are written to a new file which is imported into pr2_taxo
phytoref_taxo <- read_xlsx(excel_file, sheet = "phytoref_cyano_clean") %>%
filter(!is.na(kingdom)) %>%
distinct(kingdom, supergroup, division, class, order, family, genus, species) %>%
mutate(taxo_edited_version="4.12.0", taxo_edited_by="D.Vaulot") %>%
write_tsv (full_path("PhytoRef_cyano_new_species.tsv"), na="")
4.3 Add Genbank metadata
phytoref_cyano_clean <- read_xlsx(excel_file, sheet = "phytoref_cyano_clean")
metafile <- list()
for (i in c(1:2)) {
metafile[[i]] <- read_tsv(full_path(str_c("genbank metadata/PhytoRef_metadata_cyano_",i,".txt")),
col_type = cols(gb_collection_date = col_character())) %>%
mutate(gb_isolate=as.character(gb_isolate))
}
metadata <- reduce(metafile, bind_rows) %>%
distinct()
phytoref_cyano_meta <- left_join(phytoref_cyano_clean, metadata, by = c("genbank_accession" = "genbank_accession"))
write_tsv (phytoref_cyano_meta, full_path("PhytoRef_cyano_metadata.tsv"), na="")
4.3.1 Remove sequences with no GenBank Accession number : 1 sequence
These sequences correspond to sequences that have been removed from Genbank * by submitter (bad sequence) * because sequence was not done by submitter
4.4 Finalization
4.4.1 Add fields
phytoref_cyano_final <- phytoref_cyano_meta %>%
mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = NA,
gene="16S rRNA",
organelle = case_when( species == "Paulinella_chromatophora" ~ "chromatophore",
TRUE ~ NA_character_),
pr2_sequence_origin = case_when (str_detect(gb_definition, "genom") ~ "genome",
TRUE ~ NA_character_),
sequence_length = str_length(sequence),
ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex))
4.4.2 Remove sequences with ambiguities or too short - 0 sequences
filter(phytoref_cyano_final, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>%
arrange(desc(sequence_length)) %>%
select(phytoref_accession_01, class, genus, sequence_length,ambiguities, species,gb_definition)
phytoref_cyano_final<- phytoref_cyano_final %>%
filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))
4.4.3 Remove duplicate entries - 0
4.4.4 Find how many GenBank sequences have more than one entry - 0
4.4.5 Find Genbank already in PR2 - 0
4.4.6 Final files for uploading
sprintf(" Final number of 16S plastid sequences: %d", nrow(phytoref_cyano_final))
colnames(phytoref_cyano_final)
write_tsv (phytoref_cyano_final, full_path("phytoref_cyano_final.tsv"), na="")
pr2_update_new <- phytoref_cyano_final
pr2_main_add <- pr2_update_new %>%
select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>%
mutate(added_version = "4.12.0", edited_by = "Phytoref")
nrow(pr2_main_add)
pr2_sequences_add <- pr2_update_new %>%
select(pr2_accession, sequence, sequence_length, ambiguities)
nrow(pr2_sequences_add)
pr2_metadata_add <- pr2_update_new %>%
select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>%
select( -pr2_accession) %>%
distinct()
nrow(pr2_metadata_add)
# Write to files that are then uploaded into the datbase (easier for error tracking)
write_tsv(pr2_main_add,full_path("pr2_main_add_cyanos.txt"), na="")
write_tsv(pr2_sequences_add,full_path("pr2_sequences_add_cyanos.txt"), na="")
write_tsv(pr2_metadata_add,full_path("pr2_metadata_add_cyanos.txt"), na="")
5 Checking the taxonomy of the unknown sequences using Wang assigner - 927 sequences reassigned
5.1 Run Wang classifier (dada2)
- Reference database : 16S plastid with known species (do not contain _X in
species
field) - Query sequences, all those contain _X in the
species
field
pr2 <- pr2_read() %>% dplyr::filter (is.na(removed_version))
pr2_plastid <- pr2 %>% filter(str_detect(gene,"16S"))
pr2_plastid_known <- pr2_plastid %>% filter (!str_detect(species,"_X"))
pr2_plastid_unknown <- pr2_plastid %>% filter (str_detect(species,"_X"))
pr2_bad <- pr2_plastid_known %>% filter(str_detect(sequence,"X"))
pr2_export(pr2_plastid_unknown, file_name=full_path("plastid_unknown.fasta.gz"))
pr2_export(pr2_plastid_known, file_name=full_path("plastid_known.dada2.gz"), file_format="dada2")
pr2_plastid_unkown_assgined <- dada2_assign(seq_file_name = full_path("plastid_unknown.fasta"),
ref_file_name = full_path("plastid_known.dada2.gz"))
5.2 Read the classifier file
Rules 1. Sequences classified as Bacteria are removed from database (manually) : 9 2. Sequences that have 90 bootstrap at species level are assigned to species 3. Sequences that have 90 bootstrap at genus level are assigned to genus For the rest the assignation does not change
Look also at sequences for which pr2 assignement and wang assginement disagreeā¦
min_boot = 90
pr2_plastid_unknown_reassigned <- read_tsv(full_path("plastid_unknown.dada2.taxo") ) %>%
tidyr::separate(into= c("pr2_accession", "seq_label", str_c("taxo", 1:8) ), col=seq_name, sep = "[|]")
print("Sequence assigned to bacteria and removed from pr2")
pr2_plastid_unknown_reassigned %>%
filter(kingdom == "Bacteria")
pr2_plastid_unknown_reassigned <- pr2_plastid_unknown_reassigned %>%
filter(kingdom != "Bacteria")
print("Sequence contradiction at supergroup level")
pr2_plastid_unknown_reassigned %>%
filter(taxo2 != supergroup) %>%
select(pr2_accession, seq_label, taxo2, taxo8, supergroup, species, supergroup_boot, species_boot)
print("Sequence contradiction at division level")
pr2_plastid_unknown_reassigned %>%
filter((taxo3 != division)&(taxo2 == supergroup)) %>%
select(pr2_accession, seq_label, taxo3, taxo8, division, species, division_boot, species_boot)
print("Sequence contradiction at class level")
pr2_plastid_unknown_reassigned %>%
filter((taxo4 != class)&(taxo3 != division)&(taxo2 == supergroup)) %>%
select(pr2_accession, seq_label, taxo4, taxo8, class, species, class_boot, species_boot)
pr2_plastid_unknown_reassigned <- pr2_plastid_unknown_reassigned %>%
filter(genus_boot >= min_boot) %>%
mutate(species_new = case_when(species_boot >= min_boot ~ species,
genus_boot >= min_boot ~ str_c(genus,"_sp."),
TRUE ~ NA_character_) ) %>%
filter(!is.na(species_new))
pr2_plastid_unknown_reassigned %>%
select(pr2_accession, seq_label, taxo4, taxo7, taxo8, class, species_new, genus_boot, species_boot) %>%
write_tsv(full_path("phytoref_reassigned.tsv"), na="")
# Check new species that are not in pr2 and enter them manually
pr2_plastid_unknown_reassigned %>% filter(!(species_new %in% pr2$species)) %>%
count(species_new)