PR2 version 4.13.0
Thalassiosirales from L. Arsenieff

1 Init
2 Read pr2
3 Reference
4 Set up the files
5 Read the original data and reformat
- 5.1 Read the data
- 5.2 Compare with sequences in PR2
6 Taxonomy
- 6.1 Build and check
- 6.2 Find taxa in PR2 that are not included in the update
7 New sequences
8 Finalization
9 Save everything to an Excel file

1 Init

Load the variables common to the different scripts and the necessary libraries

  knitr::opts_chunk$set(eval=TRUE, 
                        cache = TRUE, 
                        cache.extra = file.info("../updates/2019 Arsenieff diatoms/pr2_diatoms_2019_11_21_LA.xlsx"),
                        tidy = FALSE)

  source('PR2_init.R', echo=FALSE)

2 Read pr2

Use the PR2 from Google database

# Info for the local dabase
pr2_db <- db_info("pr2_google")

pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))

pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>% collect()

pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version)) %>% collect()

pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>% collect()

# Join the tables and keep only sequences that are not removed

pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)
db_disconnect(pr2_db_con)

[1] TRUE

pr2_18S <- pr2 %>% filter(is.na(removed_version)) %>% filter(gene == "18S_rRNA")
pr2_18S_removed <- pr2 %>% filter(!is.na(removed_version)) %>% filter(gene == "18S_rRNA")

# Remove pr2 to make sure that we do not use the full pr2
remove(pr2)

3 Reference

Arsenieff L., Le Gall F., Rigaut-Jalabert F., Mahé F., Sarno D., Gouhier L., Baudoux A-C., Simon N. 2020. Diversity and dynamics of relevant nanoplanktonic diatoms in the Western English Channel. The ISME Journal. DOI: 10.1038/s41396-020-0659-6.

4 Set up the files

target_group = c("Thalassiosira", "Minidiscus", "Guinardia")
target_level = "genus"


dir_pr2_update <- "../updates/2019 Arsenieff diatoms"

pr2.env$editor <- "L. Arsenieff"

full_path <- function(file_name) {
    str_c(dir_pr2_update, "/", file_name)
}

file_pr2_update_excel <- full_path("pr2_diatoms_2019_11_21_LA.xlsx")

# create the directory for taxonomy output
dir.create(full_path("taxo"), showWarnings = FALSE)

5 Read the original data and reformat

5.1 Read the data

Number of sequences = 274

pr2_update <- read_excel(file_pr2_update_excel, sheet = "updates", guess_max = 2e+05, 
    na = c("", "-"))

str_c("Number of sequences : ", nrow(pr2_update))

[1] "Number of sequences : 274"

5.2 Compare with sequences in PR2

str_c("Number of sequences of target group in pr2 : ", nrow(filter(pr2_18S, !!as.symbol(target_level) %in% 
    target_group)))

[1] "Number of sequences of target group in pr2 : 255"

str_c("Number of updated sequences that are not present in PR2 : ", nrow(filter(pr2_update, 
    !(genbank_accession %in% pr2_18S$genbank_accession))))

[1] "Number of updated sequences that are not present in PR2 : 19"

str_c("Number of updated sequences that are not active in PR2 : ", nrow(filter(pr2_update, 
    (genbank_accession %in% pr2_18S_removed$genbank_accession))))

[1] "Number of updated sequences that are not active in PR2 : 2"

str_c("Sequences of PR2 that are misclassified - df ")

[1] "Sequences of PR2 that are misclassified - df "

filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession)) %>% count(genus)

# A tibble: 3 x 2
  genus             n
  <chr>         <int>
1 Guinardia        13
2 Minidiscus       13
3 Thalassiosira   229

str_c("Sequences from target group in PR2 that are not in update  - df ")

[1] "Sequences from target group in PR2 that are not in update  - df "

filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) & !(genbank_accession %in% 
    pr2_update$genbank_accession))  # %>%

# A tibble: 0 x 97
# ... with 97 variables: pr2_main_id <int>, pr2_accession <chr>,
#   genbank_accession <chr>, start <dbl>, end <dbl>, label <chr>, gene <chr>,
#   organelle <chr>, species <chr>, chimera <int>, chimera_remark <chr>,
#   reference_sequence <int>, added_version <chr>, removed_version <chr>,
#   edited_version <chr>, edited_by <chr>, edited_remark <chr>, remark <chr>,
#   taxo_id <int>, kingdom <chr>, supergroup <chr>, division <chr>,
#   class <chr>, order <chr>, family <chr>, genus <chr>,
#   taxon_trophic_mode <chr>, taxo_edited_version <chr>, taxo_edited_by <chr>,
#   taxo_removed_version <chr>, taxo_remark <chr>, reference <chr>,
#   seq_id <int>, sequence <chr>, sequence_length <int>, ambiguities <int>,
#   sequence_hash <chr>, gb_sequence <chr>, pr2_metadata_id <int>,
#   gb_date <chr>, gb_locus <chr>, gb_definition <chr>, gb_organism <chr>,
#   gb_organelle <chr>, gb_taxonomy <chr>, gb_strain <chr>,
#   gb_culture_collection <chr>, gb_clone <chr>, gb_isolate <chr>,
#   gb_isolation_source <chr>, gb_specimen_voucher <chr>, gb_host <chr>,
#   gb_collection_date <chr>, gb_environmental_sample <chr>, gb_country <chr>,
#   gb_lat_lon <chr>, gb_collected_by <chr>, gb_note <chr>,
#   gb_references <chr>, gb_publication <chr>, gb_authors <chr>,
#   gb_journal <chr>, pubmed_id <int>, eukref_name <chr>, eukref_source <chr>,
#   eukref_env_material <chr>, eukref_env_biome <chr>,
#   eukref_biotic_relationship <chr>, eukref_specific_host <chr>,
#   eukref_geo_loc_name <chr>, eukref_notes <chr>, pr2_sample_type <chr>,
#   pr2_sample_method <chr>, pr2_latitude <dbl>, pr2_longitude <dbl>,
#   pr2_ocean <chr>, pr2_sea <chr>, pr2_sea_lat <dbl>, pr2_sea_lon <dbl>,
#   pr2_continent <chr>, pr2_country <chr>, pr2_location <chr>,
#   pr2_location_geoname <chr>, pr2_location_geotype <chr>,
#   pr2_location_lat <dbl>, pr2_location_lon <dbl>, pr2_country_geocode <chr>,
#   pr2_country_lat <dbl>, pr2_country_lon <dbl>, pr2_sequence_origin <chr>,
#   pr2_size_fraction <chr>, pr2_size_fraction_min <dbl>,
#   pr2_size_fraction_max <dbl>, metadata_remark <chr>, junk_gb_authors <chr>,
#   junk_gb_publication <chr>, junk_gb_journal <chr>

# write_tsv(full_path('api_pr2_not_javier.tsv'), na='')

str_c("Number of PR2 discarded sequences in update : ", nrow(filter(pr2_18S_removed, 
    genbank_accession %in% pr2_update$genbank_accession)))

[1] "Number of PR2 discarded sequences in update : 2"

str_c("Sequences duplicated  - df ")

[1] "Sequences duplicated  - df "

pr2_update %>% count(genbank_accession) %>% filter(n > 1)

# A tibble: 0 x 2
# ... with 2 variables: genbank_accession <chr>, n <int>

str_c("Sequences updated with 2 entries in PR2  (e.g. with and without introns) - df ")

[1] "Sequences updated with 2 entries in PR2  (e.g. with and without introns) - df "

pr2_duplicated <- left_join(select(pr2_update, genbank_accession), select(pr2_18S, 
    genbank_accession, pr2_accession)) %>% count(genbank_accession)

6 Taxonomy

6.1 Build and check

pr2_taxo_updated <- pr2_update %>% group_by_at(pr2.env$taxo_levels) %>% count()

pr2_taxo_check(pr2_taxo_updated, full_path("taxo"))

pr2_taxo_updated <- pr2_taxo_updated %>% rename_all(~str_c(., "_new")) %>% rename(species = species_new) %>% 
    left_join(pr2_taxo) %>% rename_at(pr2.env$taxo_levels, ~str_c(., "_old")) %>% 
    rename_all(~str_replace(., "_new", "")) %>% rename(species = species_old)

write.xlsx(pr2_taxo_updated, full_path("taxo_updated.xlsx"))

6.2 Find taxa in PR2 that are not included in the update

pr2_taxo_not_updated <- pr2_taxo %>% filter(!!as.symbol(target_level) %in% target_group) %>% 
    filter(!(species %in% pr2_taxo_updated$species))

write.xlsx(pr2_taxo_not_updated, full_path("taxo_not_updated.xlsx"))

7 New sequences

7.1 List of new sequences

17 new sequences

pr2_new <- pr2_update %>% filter(!(genbank_accession %in% pr2$genbank_accession)) %>% 
    select(genbank_accession:species, sequence)

pr2_genbank_new <- pr2_new %>% select(genbank_accession)

7.2 Run the script `"`PR2 genbank dowmnload.R` in background

7.3 Read the metadata files and join by genbank accession

metafile <- list()

for (i in c(1:2)) {
    metafile[[i]] <- read_tsv(full_path(str_c("pr2_gb_metadata_", i, ".txt")), col_type = cols(gb_collection_date = col_character())) %>% 
        mutate(gb_isolate = as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>% distinct() %>% rename(gb_sequence = sequence)

pr2_new <- left_join(pr2_new, gb_metadata, by = c(genbank_accession = "genbank_accession"))

7.4 Read the features file and join

gb_features <- read_tsv(full_path("pr2_gb_features.txt")) %>% filter(product == "18S ribosomal RNA") %>% 
    rename(start_feature = start, end_feature = end) %>% distinct()

pr2_new <- left_join(pr2_new, gb_features, by = c(genbank_accession = "genbank_accession"))

write.xlsx(pr2_new, full_path("pr2_new.xlsx"), na = "")

8 Finalization

8.1 Taxo

pr2_taxo_updated <- read_excel(full_path("taxo_updated_corrected.xlsx"), guess_max = 2e+05, 
    na = c("", "-"))
# pr2_taxo_remove <- read_excel(file_pr2_update_excel, sheet = 'taxo_to_remove',
# guess_max=200000, na=c('', '-'))

8.2 Old sequences

pr2_update_old <- pr2_update %>% filter((genbank_accession %in% pr2$genbank_accession))

pr2_species_old <- pr2_18S %>% select(genbank_accession, species) %>% rename(species_old = species)

pr2_main_old <- pr2_update_old %>% select(genbank_accession, species, chimera, edited_remark, 
    removed_version) %>% mutate(edited_version = pr2.env$version, edited_by = pr2.env$editor) %>% 
    left_join(pr2_species_old) %>% filter((species != species_old) | (chimera == 
    1))  # only keep sequences for which species has been changed or labelled as chimera
nrow(pr2_main_old)

[1] 2

8.3 New sequences

8.3.1 Read new sequences once the corrections have been done

pr2_new <- read_excel(full_path("pr2_new_corrected.xlsx"), na = "") %>% filter(!is.na(gb_definition))

8.3.2 Add fields

pr2_new <- pr2_new %>% # mutate(start=1, end=sequence_length, label = case_when (intron == 'intron' ~
# 'G', TRUE ~'U')) %>%
mutate(sequence = gb_sequence, sequence_length = str_length(sequence), label = "U", 
    pr2_accession = str_c(genbank_accession, ".", start, ".", end, "_", label), reference_sequence = NA, 
    gene = "18S rRNA", organelle = "nucleus", ambiguities = str_count(sequence, pattern = pr2.env$ambig_regex))

pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence, digest::sha1)

8.3.3 Remove sequences with ambiguities or too short

filter(pr2_new, (sequence_length < pr2.env$sequence_length_min) | (ambiguities > 
    pr2.env$ambiguities_max)) %>% arrange(desc(sequence_length)) %>% select(pr2_accession, 
    class, genus, sequence_length, ambiguities, species, gb_definition)

# A tibble: 0 x 7
# ... with 7 variables: pr2_accession <chr>, class <chr>, genus <chr>,
#   sequence_length <int>, ambiguities <int>, species <chr>,
#   gb_definition <chr>

pr2_new <- pr2_new %>% filter((sequence_length >= pr2.env$sequence_length_min) & 
    (ambiguities <= pr2.env$ambiguities_max))

8.3.4 Check duplicate entries

pr2_new %>% group_by(pr2_accession) %>% summarise(n_seq = n()) %>% filter(n_seq > 
    1)

# A tibble: 0 x 2
# ... with 2 variables: pr2_accession <chr>, n_seq <int>

8.3.5 Final files for uploading

sprintf(" Final number of sequences added:  %d", nrow(pr2_new))

[1] " Final number of sequences added:  17"

colnames(pr2_new)

 [1] "genbank_accession"       "start"                  
 [3] "end"                     "label"                  
 [5] "gene"                    "organelle"              
 [7] "kingdom"                 "supergroup"             
 [9] "division"                "class"                  
[11] "order"                   "family"                 
[13] "genus"                   "species"                
[15] "sequence"                "gb_definition"          
[17] "gb_organism"             "gb_organelle"           
[19] "gb_strain"               "gb_isolate"             
[21] "gb_clone"                "gb_specimen_voucher"    
[23] "gb_collected_by"         "gb_lat_lon"             
[25] "gb_note"                 "gb_culture_collection"  
[27] "gb_isolation_source"     "gb_host"                
[29] "gb_environmental_sample" "gb_collection_date"     
[31] "gb_country"              "gb_date"                
[33] "gb_locus"                "gb_sequence"            
[35] "pr2_sample_type"         "sequence_length"        
[37] "pr2_longitude"           "pr2_latitude"           
[39] "seqnames"                "start.y"                
[41] "end.y"                   "width"                  
[43] "strand"                  "type"                   
[45] "product"                 "loctype"                
[47] "pr2_accession"           "reference_sequence"     
[49] "ambiguities"             "sequence_hash"

pr2_update_new <- pr2_new

pr2_main_new <- pr2_update_new %>% select(pr2_accession, genbank_accession, start, 
    end, label, species, organelle, gene) %>% mutate(added_version = pr2.env$version, 
    edited_by = pr2.env$editor)
nrow(pr2_main_new)

[1] 17

pr2_sequences_new <- pr2_update_new %>% select(pr2_accession, sequence, gb_sequence, 
    sequence_length, ambiguities, sequence_hash)
nrow(pr2_sequences_new)

[1] 17

pr2_metadata_new <- pr2_update_new %>% select(genbank_accession, starts_with("gb_"), 
    starts_with("pr2_"), starts_with("eukref_")) %>% select(-pr2_accession, -gb_sequence) %>% 
    distinct()
nrow(pr2_metadata_new)

[1] 17

# Write to files that are then uploaded into the datbase (easier for error
# tracking)

# write_tsv(pr2_main_add,full_path('pr2_add_main.txt'), na='')
# write_tsv(pr2_sequences_add,full_path('pr2_add_sequences.txt'), na='')
# write_tsv(pr2_metadata_add,full_path('pr2_add_metadata.txt'), na='')

9 Save everything to an Excel file

  file_pr2_imports <-  full_path("pr2_imports_final.xlsx")
 onglets <- list("pr2_taxo_updated" = pr2_taxo_updated,
                 # "pr2_taxo_remove" = pr2_taxo_remove,
                 "pr2_main_old" = pr2_main_old, 
                 # "pr2_metadata_old" = pr2_metadata_old,
                 "pr2_main_new" = pr2_main_new, 
                 "pr2_sequence_new" = pr2_sequences_new,
                 "pr2_metadata_new" = pr2_metadata_new)
 write.xlsx(onglets, file_pr2_imports)

PR2 version 4.13.0 Thalassiosirales from L. Arsenieff

PR2 version 4.13.0 Thalassiosirales from L. Arsenieff

1 Init

2 Read pr2

3 Reference

4 Set up the files

5 Read the original data and reformat

5.1 Read the data

5.2 Compare with sequences in PR2

6 Taxonomy

6.1 Build and check

6.2 Find taxa in PR2 that are not included in the update

7 New sequences

7.1 List of new sequences

7.2 Run the script "PR2 genbank dowmnload.R` in background

7.3 Read the metadata files and join by genbank accession

7.4 Read the features file and join

8 Finalization

8.1 Taxo

8.2 Old sequences

8.3 New sequences

8.3.1 Read new sequences once the corrections have been done

8.3.2 Add fields

8.3.3 Remove sequences with ambiguities or too short

8.3.4 Check duplicate entries

8.3.5 Final files for uploading

9 Save everything to an Excel file

PR2 version 4.13.0
Thalassiosirales from L. Arsenieff

PR2 version 4.13.0
Thalassiosirales from L. Arsenieff

7.2 Run the script `"`PR2 genbank dowmnload.R` in background