PR2 version 4.11.0
Chlorophyta - Mamiellophyceae

By default do not run any of the chunks when knitting

  knitr::opts_chunk$set(eval=FALSE)

Load the variables common to the different scripts and the necessary libraries

  source('PR2_init.R', echo=FALSE)

1 Main changes

Version 4.11.0

  • Update Mamiellophyceae taxonomy

2 References used

  • Tragin, M. & Vaulot, D. 2018. Novel diversity within marine Mamiellophyceae (Chlorophyta) unveiled by metabarcoding. bioRxiv.

3 Read pr2

Note : Now use the local pr2 database to avoid connection problems

# Info for the local dabase 
  pr2_db <- db_info("pr2_local")

  pr2_db_con <- db_connect(pr2_db)
  pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
  # pr2_main  <- pr2_main %>% filter (is.na(removed_version))

  pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()

  pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
  pr2_taxo  <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()

  pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()

# Join the tables and keep only sequences that are not removed

  pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
  pr2 <- left_join (pr2, pr2_seq)
  pr2 <- left_join (pr2, pr2_metadata)
  db_disconnect(pr2_db_con)
  
  pr2  <- pr2 %>% filter (is.na(removed_version))

4 Mamiellophyceae

4.1 Define annotator

  pr2.env$editor = "Vaulot D." 

4.2 Define the file names

    dir_pr2_update <- "../updates/2018 Mamiello/"
    add_dir <- function(file_name) str_c(dir_pr2_update, file_name)
    

    file_pr2_update <- add_dir("PR2 Mamiello.xlsx")
    file_pr2_update_taxo <- add_dir("taxo.txt")
    file_pr2_update_new_features <- add_dir("pr2_new_sequences_features.txt")
    file_pr2_not_updated <- add_dir("pr2_sequences_not_updated.txt")
    file_pr2_update_new <- add_dir("pr2_update_new.txt")

4.3 Export Existing sequences

  pr2_selected <- pr2 %>% 
    filter(genus %in% c("Micromonas", "Ostreococcus", "Mantoniella"))%>% 
    arrange(species)
  pr2_selected_excel <- pr2_selected %>% 
    select(pr2_accession, genbank_accession, start, end, kingdom:genus, species,gb_strain, gb_clone) 
  write_tsv(pr2_selected_excel, str_c(dir_pr2_update, "/pr2_mamiello.txt"), na="")
  pr2_export(pr2_selected, str_c(dir_pr2_update, "/pr2_mamiello.fasta.gz"), file_type = "fasta" , file_format = "fasta_taxo_short")
  pr2_export(pr2_selected, str_c(dir_pr2_update, "/pr2_mamiello_dada2.fasta.gz"), file_type = "fasta" , file_format = "dada2")

4.4 Assign the sequences

Strategy

  • Principle: Use the Wang assigner as implementerd in dada2
  • Use as reference the alignement with sequences and barcodes from Margot OSD paper
  • It is necessary to edit the sequences names to be compatible with dada2 >species_name (do not forget ; at the end)
  • The pr2 sequences are aligned with the refernce sequence and the V4 region is extracted.
  • Run the dada2 routine for assignement
  • Check the sequences which have only a part of the V4 region, in general they cannot be assigned
  • Any sequence for which the bootstrap value is below 60 should be checked manually

Note : the extraction of the V4 region could be done using primers (could be a problem for sequences starting after V4 region)

  dada2_assign(add_dir("mantoniella_pr2_V4.fasta"), ref_file_name = add_dir("mantoniella_ref.fasta"),tax_levels = c("genus","species"))
  dada2_assign(add_dir("ostreococcus_pr2_V4.fasta"), ref_file_name = add_dir("ostreococcus_ref.fasta"),tax_levels = c("species"))
  dada2_assign(add_dir("micromonas_pr2_V4.fasta"), ref_file_name = add_dir("micromonas_ref.fasta"),tax_levels = c("species"))

4.5 Read the updated Excel file - Need to put guess_max at the max

  pr2_update <- read_excel(file_pr2_update, sheet="pr2_update", col_names = TRUE, guess_max=200000)
  
  # Only keep the first columns
  pr2_update <- pr2_update %>%  select(pr2_accession:genus)

5 Construct taxonomy and compare to existing taxonomy

5.1 Construct a taxonomy file to import into Excel (taxo.txt file)

  • Check manually in the database what needs to be updated
  # Construct the taxonomy file  
    pr2_update_taxo <- pr2_update %>% 
      group_by_(.dots=pr2.env$taxo_levels) %>% 
      summarise(sequence_number = n()) %>% 
      ungroup()

  #  Check it for duplicate names and duplicate parents
    pr2_taxo_check (select(pr2_update_taxo, kingdom:species), dir_pr2_update)
  
  # Compare to existing taxonomy
    pr2_update_taxo_compared <- left_join (pr2_update_taxo, pr2_taxo, by = c("species"="species")) %>%
                                 rename_ ("kingdom"="kingdom.x", "supergroup"="supergroup.x", "division" = "division.x",
                                          "class" = "class.x", "order" = "order.x", "family"="family.x", "genus" = "genus.x") %>%
                                  arrange(species)
    
  # Write to a file to examine with Excel  
    write_tsv(pr2_update_taxo_compared, file_pr2_update_taxo, na = "")

5.2 Update the entries that are already in the taxo table

  • 1 species
  • These entries have stuff in the taxo.y columns
  • Note that the version and the editors have to be changed in the dv_function_pr2 file
  • Note that the function update other species belonging to the same genera so that they have the same upstream taxonomy
  pr2_taxo_update <- pr2_update_taxo_compared %>% filter(!(is.na(kingdom.y))) %>%
                            select(kingdom:species) 
  pr2_taxo_update <- pr2_taxo_update(pr2_taxo_update, method = "update")
  
  # Check which lines did not work
  filter( pr2_taxo_update, query_result==0)

5.3 Append the new entries

  • 14 species or clades
  • All old entries removed manually
  pr2_taxo_add <- pr2_update_taxo_compared %>% filter(is.na(kingdom.y)) %>%
                            select(kingdom:species) 
  pr2_taxo_add <- pr2_taxo_update(pr2_taxo_add, method = "add")
  
  # Check which lines did not work (if works query_result = 1)
  filter(pr2_taxo_add, query_result==0)

5.4 Recheck the whole taxonomy table

  pr2_taxo <- get_query(db_pr2, "select * from taxo") 
  pr2_taxo  <- pr2_taxo %>% filter (is.na(taxo_removed_version))
  pr2_taxo_check (select(pr2_taxo, kingdom:species), dir_pr2_update)

6 Old sequences

  • 872 sequences updated
  # Extract lines with accession number already in PR2, then one need just to update the species, nothing more 
    pr2_update_old <- pr2_update %>% filter(genbank_accession %in% pr2$genbank_accession)
    pr2_update_old <- pr2_sequence_reassign(pr2_update_old)

7 New sequences

7.1 Extract lines with new accession number

  • 8 sequences
    pr2_update_new <- pr2_update %>% filter(!(genbank_accession %in% pr2$genbank_accession))
    write_tsv(pr2_update_new, str_c(dir_pr2_update, "/pr2_update_new.txt"), na="")

7.2 Download the new sequences and parse them

    pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
    write_tsv(pr2_update_new_metadata, str_c(dir_pr2_update,"/pr2_update_new_metadata.txt"), na="")
    head(pr2_update_new_metadata, 10)

7.3 Extract features

Features that can be used to determine the start and end of the 18S - This is stored in a file.

    pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
    write_tsv(pr2_update_new_features,file_pr2_update_new_features, na = "")  

7.4 Update sequence_start and sequence_end from feature table

Enter the start and end of the sequence into the pr2_update_new Excel sheet and reload Note : Not done because no sequence with ITS region

#    pr2_update_new <- read_excel(file_pr2_update, sheet="pr2_update_new", col_names = TRUE, guess_max=200000)

7.5 Merge the accession, taxonomy, sequence and metadata

    pr2_update_new <-left_join(select(pr2_update_new, genbank_accession, start, end, species), pr2_update_new_metadata)
   
  # Remove sequences which are not yet in Genbank
    pr2_update_new <- pr2_update_new %>% filter(!is.na(gb_definition))

7.6 Extract sequences and get length and number of ambiguities

3 sequences are too short - 5 sequences remaining

  # Compute the start and end of sequence if not uploaded from the Excel file
   pr2_update_new <- pr2_update_new %>% 
      mutate(start = case_when(is.na(start) ~ 1 ,
                                        TRUE ~ start),
             end = case_when(is.na(end) ~ sequence_length ,
                                        TRUE ~ as.integer(end))) %>% 
             mutate (sequence = str_sub(sequence, start, end),
                     sequence_length_old = sequence_length)
   

  # Update sequence length and ambiguities for all sequences since some may have been shortened
    pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence) 
    pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence,pattern="[NRYSWKMBDHV]")

  # Remove sequences that are shorter than minimum lenght (500) and those that have too many ambiguities
    pr2_update_new_rejected <- pr2_update_new %>% 
      filter((sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max))
    write_tsv( pr2_update_new_rejected, str_c(dir_pr2_update,"/pr2_update_new_rejected.txt"), na = "")
    pr2_update_new_rejected$genbank_accession
    
    pr2_update_new <- pr2_update_new %>% filter((sequence_length >= pr2.env$sequence_length_min) & 
                                                (ambiguities <= pr2.env$ambiguities_max))

7.7 Create PR2 fields

  #   pr2_update_new <- pr2_update_new %>% 
  # dplyr::rename(start=sequence_start, end=sequence_end)
    pr2_update_new <- pr2_update_new %>% 
        mutate (label = "U", 
                added_version = pr2.env$version, 
                edited_by = pr2.env$editor,
                pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label))

  # Add reference_sequence = 1 if the sequence are reference sequences
  
  # Unquote the following three lines if Chimera or better write an "if statement" if species = Chimera_XXXX
  # pr2_update_new$removed_version <- pr2_update_new$added_version
  # pr2_update_new$chimera <- 1
  # pr2_update_new$chimera_remark <- "Detected by B. Edvardsen by UCHIME or long branch/manual"

    write_tsv(pr2_update_new,str_c(dir_pr2_update, "/pr2_update_new.txt"))

7.8 Create the different tables

    pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession, start,end, label, species,  
                                     starts_with("added_"),starts_with("removed_"), starts_with("chimera"))
    pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length, ambiguities)
    pr2_metadata_update <- select(pr2_update_new,genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) 
    pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)

7.9 Write to database

The update can be done either : * directly (but depends on Internet connection) - write_db * by exporting text file and then reimporting using the text file directly . The advantage of this method is that the data can be edited.

    write_tsv(pr2_main_update,str_c(dir_pr2_update, "/pr2_main_update.txt"), na="")
    write_tsv(pr2_sequences_update,str_c(dir_pr2_update, "/pr2_sequences_update.txt"), na="")
    write_tsv(pr2_metadata_update,str_c(dir_pr2_update, "/pr2_metadata_update.txt"), na="")

    db_append_records (pr2_db, table_name = "pr2_main", table_df = pr2_main_update )
    db_append_records (pr2_db, table_name = "pr2_sequences", table_df = pr2_sequences_update )
    db_append_records (pr2_db, table_name = "pr2_metadata", table_df = pr2_metadata_update )

8 Reference sequences

8.1 Find the references sequences

  • 45 sequences
 pr2_update_reference <- pr2_update %>% filter(!is.na(reference_sequence))

8.2 Build query to upgrade the fields

  pr2_update_reference <- pr2_update_reference %>% 
    mutate(query = str_c("UPDATE pr2_main",
                         " SET reference_sequence = ", db_sql_escape(reference_sequence),
                         " WHERE genbank_accession = ",db_sql_escape(genbank_accession)))

#Execute query
  pr2_update_reference$query_result <- db_execute_query_vector(pr2_db,pr2_update_reference$query)

Daniel Vaulot

26 10 2018