PR2 version 4.11.0
Chlorophyta - Mamiellophyceae

1 Main changes
2 References used
3 Read pr2
4 Mamiellophyceae
5 Construct taxonomy and compare to existing taxonomy
6 Old sequences
7 New sequences
8 Reference sequences
- 8.1 Find the references sequences
- 8.2 Build query to upgrade the fields

By default do not run any of the chunks when knitting

  knitr::opts_chunk$set(eval=FALSE)

Load the variables common to the different scripts and the necessary libraries

  source('PR2_init.R', echo=FALSE)

1 Main changes

Version 4.11.0

Update Mamiellophyceae taxonomy

2 References used

Tragin, M. & Vaulot, D. 2018. Novel diversity within marine Mamiellophyceae (Chlorophyta) unveiled by metabarcoding. bioRxiv.

3 Read pr2

Note : Now use the local pr2 database to avoid connection problems

# Info for the local dabase 
  pr2_db <- db_info("pr2_local")

  pr2_db_con <- db_connect(pr2_db)
  pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
  # pr2_main  <- pr2_main %>% filter (is.na(removed_version))

  pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()

  pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
  pr2_taxo  <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()

  pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()

# Join the tables and keep only sequences that are not removed

  pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
  pr2 <- left_join (pr2, pr2_seq)
  pr2 <- left_join (pr2, pr2_metadata)
  db_disconnect(pr2_db_con)
  
  pr2  <- pr2 %>% filter (is.na(removed_version))

4 Mamiellophyceae

4.1 Define annotator

  pr2.env$editor = "Vaulot D."

4.2 Define the file names

    dir_pr2_update <- "../updates/2018 Mamiello/"
    add_dir <- function(file_name) str_c(dir_pr2_update, file_name)
    

    file_pr2_update <- add_dir("PR2 Mamiello.xlsx")
    file_pr2_update_taxo <- add_dir("taxo.txt")
    file_pr2_update_new_features <- add_dir("pr2_new_sequences_features.txt")
    file_pr2_not_updated <- add_dir("pr2_sequences_not_updated.txt")
    file_pr2_update_new <- add_dir("pr2_update_new.txt")

4.3 Export Existing sequences

  pr2_selected <- pr2 %>% 
    filter(genus %in% c("Micromonas", "Ostreococcus", "Mantoniella"))%>% 
    arrange(species)
  pr2_selected_excel <- pr2_selected %>% 
    select(pr2_accession, genbank_accession, start, end, kingdom:genus, species,gb_strain, gb_clone) 
  write_tsv(pr2_selected_excel, str_c(dir_pr2_update, "/pr2_mamiello.txt"), na="")
  pr2_export(pr2_selected, str_c(dir_pr2_update, "/pr2_mamiello.fasta.gz"), file_type = "fasta" , file_format = "fasta_taxo_short")
  pr2_export(pr2_selected, str_c(dir_pr2_update, "/pr2_mamiello_dada2.fasta.gz"), file_type = "fasta" , file_format = "dada2")

4.4 Assign the sequences

Strategy

Principle: Use the Wang assigner as implementerd in dada2
Use as reference the alignement with sequences and barcodes from Margot OSD paper
It is necessary to edit the sequences names to be compatible with dada2 >species_name (do not forget ; at the end)
The pr2 sequences are aligned with the refernce sequence and the V4 region is extracted.
Run the dada2 routine for assignement
Check the sequences which have only a part of the V4 region, in general they cannot be assigned
Any sequence for which the bootstrap value is below 60 should be checked manually

Note : the extraction of the V4 region could be done using primers (could be a problem for sequences starting after V4 region)

  dada2_assign(add_dir("mantoniella_pr2_V4.fasta"), ref_file_name = add_dir("mantoniella_ref.fasta"),tax_levels = c("genus","species"))
  dada2_assign(add_dir("ostreococcus_pr2_V4.fasta"), ref_file_name = add_dir("ostreococcus_ref.fasta"),tax_levels = c("species"))
  dada2_assign(add_dir("micromonas_pr2_V4.fasta"), ref_file_name = add_dir("micromonas_ref.fasta"),tax_levels = c("species"))

4.5 Read the updated Excel file - Need to put guess_max at the max

  pr2_update <- read_excel(file_pr2_update, sheet="pr2_update", col_names = TRUE, guess_max=200000)
  
  # Only keep the first columns
  pr2_update <- pr2_update %>%  select(pr2_accession:genus)

5 Construct taxonomy and compare to existing taxonomy

5.1 Construct a taxonomy file to import into Excel (taxo.txt file)

Check manually in the database what needs to be updated

  # Construct the taxonomy file  
    pr2_update_taxo <- pr2_update %>% 
      group_by_(.dots=pr2.env$taxo_levels) %>% 
      summarise(sequence_number = n()) %>% 
      ungroup()

  #  Check it for duplicate names and duplicate parents
    pr2_taxo_check (select(pr2_update_taxo, kingdom:species), dir_pr2_update)
  
  # Compare to existing taxonomy
    pr2_update_taxo_compared <- left_join (pr2_update_taxo, pr2_taxo, by = c("species"="species")) %>%
                                 rename_ ("kingdom"="kingdom.x", "supergroup"="supergroup.x", "division" = "division.x",
                                          "class" = "class.x", "order" = "order.x", "family"="family.x", "genus" = "genus.x") %>%
                                  arrange(species)
    
  # Write to a file to examine with Excel  
    write_tsv(pr2_update_taxo_compared, file_pr2_update_taxo, na = "")

5.2 Update the entries that are already in the taxo table

1 species
These entries have stuff in the taxo.y columns
Note that the version and the editors have to be changed in the dv_function_pr2 file
Note that the function update other species belonging to the same genera so that they have the same upstream taxonomy

  pr2_taxo_update <- pr2_update_taxo_compared %>% filter(!(is.na(kingdom.y))) %>%
                            select(kingdom:species) 
  pr2_taxo_update <- pr2_taxo_update(pr2_taxo_update, method = "update")
  
  # Check which lines did not work
  filter( pr2_taxo_update, query_result==0)

5.3 Append the new entries

14 species or clades
All old entries removed manually

  pr2_taxo_add <- pr2_update_taxo_compared %>% filter(is.na(kingdom.y)) %>%
                            select(kingdom:species) 
  pr2_taxo_add <- pr2_taxo_update(pr2_taxo_add, method = "add")
  
  # Check which lines did not work (if works query_result = 1)
  filter(pr2_taxo_add, query_result==0)

5.4 Recheck the whole taxonomy table

  pr2_taxo <- get_query(db_pr2, "select * from taxo") 
  pr2_taxo  <- pr2_taxo %>% filter (is.na(taxo_removed_version))
  pr2_taxo_check (select(pr2_taxo, kingdom:species), dir_pr2_update)

6 Old sequences

872 sequences updated

  # Extract lines with accession number already in PR2, then one need just to update the species, nothing more 
    pr2_update_old <- pr2_update %>% filter(genbank_accession %in% pr2$genbank_accession)
    pr2_update_old <- pr2_sequence_reassign(pr2_update_old)

7 New sequences

7.1 Extract lines with new accession number

8 sequences

    pr2_update_new <- pr2_update %>% filter(!(genbank_accession %in% pr2$genbank_accession))
    write_tsv(pr2_update_new, str_c(dir_pr2_update, "/pr2_update_new.txt"), na="")

7.2 Download the new sequences and parse them

    pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
    write_tsv(pr2_update_new_metadata, str_c(dir_pr2_update,"/pr2_update_new_metadata.txt"), na="")
    head(pr2_update_new_metadata, 10)

7.3 Extract features

Features that can be used to determine the start and end of the 18S - This is stored in a file.

    pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession, pr2.env$genbank_directory)
    write_tsv(pr2_update_new_features,file_pr2_update_new_features, na = "")

7.4 Update sequence_start and sequence_end from feature table

Enter the start and end of the sequence into the pr2_update_new Excel sheet and reload Note : Not done because no sequence with ITS region

#    pr2_update_new <- read_excel(file_pr2_update, sheet="pr2_update_new", col_names = TRUE, guess_max=200000)

7.5 Merge the accession, taxonomy, sequence and metadata

    pr2_update_new <-left_join(select(pr2_update_new, genbank_accession, start, end, species), pr2_update_new_metadata)
   
  # Remove sequences which are not yet in Genbank
    pr2_update_new <- pr2_update_new %>% filter(!is.na(gb_definition))

7.6 Extract sequences and get length and number of ambiguities

3 sequences are too short - 5 sequences remaining

  # Compute the start and end of sequence if not uploaded from the Excel file
   pr2_update_new <- pr2_update_new %>% 
      mutate(start = case_when(is.na(start) ~ 1 ,
                                        TRUE ~ start),
             end = case_when(is.na(end) ~ sequence_length ,
                                        TRUE ~ as.integer(end))) %>% 
             mutate (sequence = str_sub(sequence, start, end),
                     sequence_length_old = sequence_length)
   

  # Update sequence length and ambiguities for all sequences since some may have been shortened
    pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence) 
    pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence,pattern="[NRYSWKMBDHV]")

  # Remove sequences that are shorter than minimum lenght (500) and those that have too many ambiguities
    pr2_update_new_rejected <- pr2_update_new %>% 
      filter((sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max))
    write_tsv( pr2_update_new_rejected, str_c(dir_pr2_update,"/pr2_update_new_rejected.txt"), na = "")
    pr2_update_new_rejected$genbank_accession
    
    pr2_update_new <- pr2_update_new %>% filter((sequence_length >= pr2.env$sequence_length_min) & 
                                                (ambiguities <= pr2.env$ambiguities_max))

7.7 Create PR2 fields

  #   pr2_update_new <- pr2_update_new %>% 
  # dplyr::rename(start=sequence_start, end=sequence_end)
    pr2_update_new <- pr2_update_new %>% 
        mutate (label = "U", 
                added_version = pr2.env$version, 
                edited_by = pr2.env$editor,
                pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label))

  # Add reference_sequence = 1 if the sequence are reference sequences
  
  # Unquote the following three lines if Chimera or better write an "if statement" if species = Chimera_XXXX
  # pr2_update_new$removed_version <- pr2_update_new$added_version
  # pr2_update_new$chimera <- 1
  # pr2_update_new$chimera_remark <- "Detected by B. Edvardsen by UCHIME or long branch/manual"

    write_tsv(pr2_update_new,str_c(dir_pr2_update, "/pr2_update_new.txt"))

7.8 Create the different tables

    pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession, start,end, label, species,  
                                     starts_with("added_"),starts_with("removed_"), starts_with("chimera"))
    pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length, ambiguities)
    pr2_metadata_update <- select(pr2_update_new,genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) 
    pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)

7.9 Write to database

The update can be done either : * directly (but depends on Internet connection) - write_db * by exporting text file and then reimporting using the text file directly . The advantage of this method is that the data can be edited.

    write_tsv(pr2_main_update,str_c(dir_pr2_update, "/pr2_main_update.txt"), na="")
    write_tsv(pr2_sequences_update,str_c(dir_pr2_update, "/pr2_sequences_update.txt"), na="")
    write_tsv(pr2_metadata_update,str_c(dir_pr2_update, "/pr2_metadata_update.txt"), na="")

    db_append_records (pr2_db, table_name = "pr2_main", table_df = pr2_main_update )
    db_append_records (pr2_db, table_name = "pr2_sequences", table_df = pr2_sequences_update )
    db_append_records (pr2_db, table_name = "pr2_metadata", table_df = pr2_metadata_update )

8 Reference sequences

8.1 Find the references sequences

45 sequences

 pr2_update_reference <- pr2_update %>% filter(!is.na(reference_sequence))

8.2 Build query to upgrade the fields

  pr2_update_reference <- pr2_update_reference %>% 
    mutate(query = str_c("UPDATE pr2_main",
                         " SET reference_sequence = ", db_sql_escape(reference_sequence),
                         " WHERE genbank_accession = ",db_sql_escape(genbank_accession)))

#Execute query
  pr2_update_reference$query_result <- db_execute_query_vector(pr2_db,pr2_update_reference$query)

PR2 version 4.11.0 Chlorophyta - Mamiellophyceae

PR2 version 4.11.0 Chlorophyta - Mamiellophyceae

1 Main changes

2 References used

3 Read pr2

4 Mamiellophyceae

4.1 Define annotator

4.2 Define the file names

4.3 Export Existing sequences

4.4 Assign the sequences

4.5 Read the updated Excel file - Need to put guess_max at the max

5 Construct taxonomy and compare to existing taxonomy

5.1 Construct a taxonomy file to import into Excel (taxo.txt file)

5.2 Update the entries that are already in the taxo table

5.3 Append the new entries

5.4 Recheck the whole taxonomy table

6 Old sequences

7 New sequences

7.1 Extract lines with new accession number

7.2 Download the new sequences and parse them

7.3 Extract features

7.4 Update sequence_start and sequence_end from feature table

7.5 Merge the accession, taxonomy, sequence and metadata

7.6 Extract sequences and get length and number of ambiguities

7.7 Create PR2 fields

7.8 Create the different tables

7.9 Write to database

8 Reference sequences

8.1 Find the references sequences

8.2 Build query to upgrade the fields

PR2 version 4.11.0
Chlorophyta - Mamiellophyceae

PR2 version 4.11.0
Chlorophyta - Mamiellophyceae