PR2 version 4.12.0
Part B - Plastid 16S
1 Initialize libraries
By default do not run any of the chunks when knitting
Load the variables common to the different scripts and the necessary libraries
2 Read pr2
3 Chaetoceros
3.1 Read Excel file
Create a table from existing information
Worksheet Chaetoceros Gaonkar 2018.xlsx
Fasta PhytoRef_with_taxonomy.fasta
* Fasta file with taxonomy
3.2 Basic cleaning
3.2.1 Any sequence repeated ?
3.3 Update taxonomy in pr2_taxo table
3.3.1 Find species not present in pr2 and export
- Species are written to a new file which is imported into pr2_taxo
pr2_taxo_new <- pr2_new %>%
select(kingdom:family,genus, species) %>%
filter(!(species %in% pr2_taxo$species)) %>%
distinct() %>%
mutate(taxo_edited_version="4.12.0", taxo_edited_by="C.C. Gaonkar", reference="Gaonkar et al. 2019 PLOSOne")
write_tsv (pr2_taxo_new, full_path("pr2_new_species.txt"), na="")
3.4 Add Genbank metadata
3.4.1 Find genbank sequences not present in PR2
3.4.2 Run the script "
PR2 genbank dowmnload.R` in background
3.4.3 Read the metadata files and join by genbank accession
metafile <- list()
for (i in c(1)) {
metafile[[i]] <- read_tsv(full_path(str_c("pr2_gb_metadata_",i,".txt")),
col_type = cols(gb_collection_date = col_character())) %>%
mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>%
distinct()
pr2_new_2 <- left_join(pr2_new, gb_metadata, by = c("genbank_accession" = "genbank_accession"))
3.4.4 Read the features file and join
gb_features <- read_tsv(full_path("pr2_gb_features.txt")) %>%
filter(type== "intron") %>%
select(genbank_accession, type) %>%
rename(intron = type) %>%
distinct()
pr2_new_2 <- left_join(pr2_new_2, gb_features, by = c("genbank_accession" = "genbank_accession"))
write_tsv (pr2_new_2, full_path("pr2_new.txt"), na="")
3.5 Finalization
3.5.1 Select new sequences only
3.5.2 Add fields
pr2_new_4 <- pr2_new_3 %>%
mutate(start=1, end=sequence_length,
label = case_when (intron == "intron" ~ "G",
TRUE ~"U")) %>%
mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = NA,
gene="18S rRNA",
organelle = "nucleus",
ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex))
3.5.3 Remove sequences with ambiguities or too short - 102 sequences
filter(pr2_new_4, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>%
arrange(desc(sequence_length)) %>%
select(pr2_accession, class, genus, sequence_length,ambiguities, species,gb_definition)
pr2_new_5<- pr2_new_4 %>%
filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))
3.5.4 Check duplicate entries
3.5.5 Final files for uploading
sprintf(" Final number of sequences added: %d", nrow(pr2_new_5))
colnames(pr2_new_5)
write_tsv (pr2_new_5, full_path("pr2_new_final.tsv"), na="")
pr2_update_new <- pr2_new_5
pr2_main_add <- pr2_update_new %>%
select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>%
mutate(added_version = "4.12.0", edited_by = "G. Gaonkar")
nrow(pr2_main_add)
pr2_sequences_add <- pr2_update_new %>%
select(pr2_accession, sequence, sequence_length, ambiguities)
nrow(pr2_sequences_add)
pr2_metadata_add <- pr2_update_new %>%
select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>%
select( -pr2_accession) %>%
distinct()
nrow(pr2_metadata_add)
# Write to files that are then uploaded into the datbase (easier for error tracking)
write_tsv(pr2_main_add,full_path("pr2_add_main.txt"), na="")
write_tsv(pr2_sequences_add,full_path("pr2_add_sequences.txt"), na="")
write_tsv(pr2_metadata_add,full_path("pr2_add_metadata.txt"), na="")