PR2 version 4.12.0
Part B - Plastid 16S

1 Initialize libraries
2 Read pr2
3 Chaetoceros

1 Initialize libraries

By default do not run any of the chunks when knitting

  knitr::opts_chunk$set(eval=FALSE,
                        cache=TRUE)

Load the variables common to the different scripts and the necessary libraries

  source('PR2_init.R', echo=TRUE)

2 Read pr2

  pr2  <- pr2_read() %>% dplyr::filter (is.na(removed_version))

  pr2_db_con <- db_connect(db_info("pr2_local"))

  pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
  pr2_taxo  <- pr2_taxo %>% 
    filter (is.na(taxo_removed_version))%>% 
    collect()

3 Chaetoceros

3.1 Read Excel file

Create a table from existing information

Worksheet Chaetoceros Gaonkar 2018.xlsx

Fasta PhytoRef_with_taxonomy.fasta * Fasta file with taxonomy

full_path <- function(file_name) { str_c("../updates/2019 Chaetoceros Kooistra/", file_name)}

excel_file <- full_path("Chaetoceros Gaonkar 2018.xlsx")

# Tab download
pr2_new <- read_xlsx(excel_file, sheet = "chaetoceros")

3.2 Basic cleaning

3.2.1 Any sequence repeated ?

pr2_new %>% count(genbank_accession) %>% filter(n>1)

3.3 Update taxonomy in pr2_taxo table

3.3.1 Find species not present in pr2 and export

Species are written to a new file which is imported into pr2_taxo

pr2_taxo_new <- pr2_new %>% 
  select(kingdom:family,genus, species) %>% 
  filter(!(species %in% pr2_taxo$species)) %>% 
  distinct() %>% 
  mutate(taxo_edited_version="4.12.0", taxo_edited_by="C.C. Gaonkar", reference="Gaonkar et al. 2019 PLOSOne") 
 
write_tsv (pr2_taxo_new, full_path("pr2_new_species.txt"), na="")

3.4 Add Genbank metadata

3.4.1 Find genbank sequences not present in PR2

pr2_genbank_new <- pr2_new %>% 
  select(genbank_accession) %>% 
  filter(!(genbank_accession %in% pr2$genbank_accession))

3.4.2 Run the script `"`PR2 genbank dowmnload.R` in background

3.4.3 Read the metadata files and join by genbank accession

metafile <- list()

for (i in c(1)) {
  metafile[[i]] <- read_tsv(full_path(str_c("pr2_gb_metadata_",i,".txt")), 
                            col_type = cols(gb_collection_date = col_character())) %>% 
  mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>%  
  distinct()

pr2_new_2 <- left_join(pr2_new, gb_metadata, by = c("genbank_accession" = "genbank_accession"))

3.4.4 Read the features file and join

gb_features <- read_tsv(full_path("pr2_gb_features.txt")) %>% 
  filter(type== "intron") %>% 
  select(genbank_accession, type) %>% 
  rename(intron = type) %>% 
  distinct()

pr2_new_2 <- left_join(pr2_new_2, gb_features, by = c("genbank_accession" = "genbank_accession"))

write_tsv (pr2_new_2, full_path("pr2_new.txt"), na="")

3.5 Finalization

3.5.1 Select new sequences only

pr2_new_3 <- pr2_new_2 %>% filter(!is.na(gb_definition))

3.5.2 Add fields

 pr2_new_4 <- pr2_new_3 %>% 
  mutate(start=1, end=sequence_length,
         label = case_when (intron == "intron" ~ "G",
                            TRUE ~"U")) %>% 
  mutate(pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
         reference_sequence = NA,
         gene="18S rRNA",
         organelle = "nucleus",
         ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex))

3.5.3 Remove sequences with ambiguities or too short - 102 sequences

filter(pr2_new_4, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>% 
  arrange(desc(sequence_length)) %>% 
  select(pr2_accession, class, genus, sequence_length,ambiguities, species,gb_definition)

pr2_new_5<- pr2_new_4 %>% 
  filter((sequence_length >= pr2.env$sequence_length_min) & (ambiguities <= pr2.env$ambiguities_max))

3.5.4 Check duplicate entries

 pr2_new_5 %>% group_by(pr2_accession) %>% summarise(n_seq = n()) %>% filter(n_seq > 1)

3.5.5 Final files for uploading

 sprintf(" Final number of sequences added:  %d", nrow(pr2_new_5))
 colnames(pr2_new_5)

 write_tsv (pr2_new_5, full_path("pr2_new_final.tsv"), na="")
 
 pr2_update_new <- pr2_new_5
 
  pr2_main_add <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>% 
    mutate(added_version = "4.12.0", edited_by = "G. Gaonkar")
  nrow(pr2_main_add)
  
  pr2_sequences_add <- pr2_update_new %>% 
    select(pr2_accession, sequence, sequence_length, ambiguities)
  nrow(pr2_sequences_add)
  
  pr2_metadata_add <- pr2_update_new %>% 
    select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>% 
    select( -pr2_accession) %>% 
    distinct()
  nrow(pr2_metadata_add)
  
  # Write to files that are then uploaded into the datbase (easier for error tracking)
    
  write_tsv(pr2_main_add,full_path("pr2_add_main.txt"), na="")
  write_tsv(pr2_sequences_add,full_path("pr2_add_sequences.txt"), na="")
  write_tsv(pr2_metadata_add,full_path("pr2_add_metadata.txt"), na="")

PR2 version 4.12.0 Part B - Plastid 16S

PR2 version 4.12.0 Part B - Plastid 16S

1 Initialize libraries

2 Read pr2

3 Chaetoceros

3.1 Read Excel file

3.2 Basic cleaning

3.2.1 Any sequence repeated ?

3.3 Update taxonomy in pr2_taxo table

3.3.1 Find species not present in pr2 and export

3.4 Add Genbank metadata

3.4.1 Find genbank sequences not present in PR2

3.4.2 Run the script "PR2 genbank dowmnload.R` in background

3.4.3 Read the metadata files and join by genbank accession

3.4.4 Read the features file and join

3.5 Finalization

3.5.1 Select new sequences only

3.5.2 Add fields

3.5.3 Remove sequences with ambiguities or too short - 102 sequences

3.5.4 Check duplicate entries

3.5.5 Final files for uploading

PR2 version 4.12.0
Part B - Plastid 16S

PR2 version 4.12.0
Part B - Plastid 16S

3.4.2 Run the script `"`PR2 genbank dowmnload.R` in background