PR2 version 4.13.0
Thalassiosirales from L. Arsenieff
PR2 version 4.13.0
Thalassiosirales from L. Arsenieff
1 Init
Load the variables common to the different scripts and the necessary libraries
2 Read pr2
- Use the PR2 from Google database
# Info for the local dabase
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))
pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version)) %>% collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>% collect()
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)
db_disconnect(pr2_db_con)
[1] TRUE
3 Reference
- Arsenieff L., Le Gall F., Rigaut-Jalabert F., Mahé F., Sarno D., Gouhier L., Baudoux A-C., Simon N. 2020. Diversity and dynamics of relevant nanoplanktonic diatoms in the Western English Channel. The ISME Journal. DOI: 10.1038/s41396-020-0659-6.
4 Set up the files
target_group = c("Thalassiosira", "Minidiscus", "Guinardia")
target_level = "genus"
dir_pr2_update <- "../updates/2019 Arsenieff diatoms"
pr2.env$editor <- "L. Arsenieff"
full_path <- function(file_name) {
str_c(dir_pr2_update, "/", file_name)
}
file_pr2_update_excel <- full_path("pr2_diatoms_2019_11_21_LA.xlsx")
# create the directory for taxonomy output
dir.create(full_path("taxo"), showWarnings = FALSE)
5 Read the original data and reformat
5.1 Read the data
- Number of sequences = 274
pr2_update <- read_excel(file_pr2_update_excel, sheet = "updates", guess_max = 2e+05,
na = c("", "-"))
str_c("Number of sequences : ", nrow(pr2_update))
[1] "Number of sequences : 274"
5.2 Compare with sequences in PR2
str_c("Number of sequences of target group in pr2 : ", nrow(filter(pr2_18S, !!as.symbol(target_level) %in%
target_group)))
[1] "Number of sequences of target group in pr2 : 255"
str_c("Number of updated sequences that are not present in PR2 : ", nrow(filter(pr2_update,
!(genbank_accession %in% pr2_18S$genbank_accession))))
[1] "Number of updated sequences that are not present in PR2 : 19"
str_c("Number of updated sequences that are not active in PR2 : ", nrow(filter(pr2_update,
(genbank_accession %in% pr2_18S_removed$genbank_accession))))
[1] "Number of updated sequences that are not active in PR2 : 2"
[1] "Sequences of PR2 that are misclassified - df "
# A tibble: 3 x 2
genus n
<chr> <int>
1 Guinardia 13
2 Minidiscus 13
3 Thalassiosira 229
[1] "Sequences from target group in PR2 that are not in update - df "
filter(pr2_18S, (!!as.symbol(target_level) %in% target_group) & !(genbank_accession %in%
pr2_update$genbank_accession)) # %>%
# A tibble: 0 x 97
# ... with 97 variables: pr2_main_id <int>, pr2_accession <chr>,
# genbank_accession <chr>, start <dbl>, end <dbl>, label <chr>, gene <chr>,
# organelle <chr>, species <chr>, chimera <int>, chimera_remark <chr>,
# reference_sequence <int>, added_version <chr>, removed_version <chr>,
# edited_version <chr>, edited_by <chr>, edited_remark <chr>, remark <chr>,
# taxo_id <int>, kingdom <chr>, supergroup <chr>, division <chr>,
# class <chr>, order <chr>, family <chr>, genus <chr>,
# taxon_trophic_mode <chr>, taxo_edited_version <chr>, taxo_edited_by <chr>,
# taxo_removed_version <chr>, taxo_remark <chr>, reference <chr>,
# seq_id <int>, sequence <chr>, sequence_length <int>, ambiguities <int>,
# sequence_hash <chr>, gb_sequence <chr>, pr2_metadata_id <int>,
# gb_date <chr>, gb_locus <chr>, gb_definition <chr>, gb_organism <chr>,
# gb_organelle <chr>, gb_taxonomy <chr>, gb_strain <chr>,
# gb_culture_collection <chr>, gb_clone <chr>, gb_isolate <chr>,
# gb_isolation_source <chr>, gb_specimen_voucher <chr>, gb_host <chr>,
# gb_collection_date <chr>, gb_environmental_sample <chr>, gb_country <chr>,
# gb_lat_lon <chr>, gb_collected_by <chr>, gb_note <chr>,
# gb_references <chr>, gb_publication <chr>, gb_authors <chr>,
# gb_journal <chr>, pubmed_id <int>, eukref_name <chr>, eukref_source <chr>,
# eukref_env_material <chr>, eukref_env_biome <chr>,
# eukref_biotic_relationship <chr>, eukref_specific_host <chr>,
# eukref_geo_loc_name <chr>, eukref_notes <chr>, pr2_sample_type <chr>,
# pr2_sample_method <chr>, pr2_latitude <dbl>, pr2_longitude <dbl>,
# pr2_ocean <chr>, pr2_sea <chr>, pr2_sea_lat <dbl>, pr2_sea_lon <dbl>,
# pr2_continent <chr>, pr2_country <chr>, pr2_location <chr>,
# pr2_location_geoname <chr>, pr2_location_geotype <chr>,
# pr2_location_lat <dbl>, pr2_location_lon <dbl>, pr2_country_geocode <chr>,
# pr2_country_lat <dbl>, pr2_country_lon <dbl>, pr2_sequence_origin <chr>,
# pr2_size_fraction <chr>, pr2_size_fraction_min <dbl>,
# pr2_size_fraction_max <dbl>, metadata_remark <chr>, junk_gb_authors <chr>,
# junk_gb_publication <chr>, junk_gb_journal <chr>
# write_tsv(full_path('api_pr2_not_javier.tsv'), na='')
str_c("Number of PR2 discarded sequences in update : ", nrow(filter(pr2_18S_removed,
genbank_accession %in% pr2_update$genbank_accession)))
[1] "Number of PR2 discarded sequences in update : 2"
[1] "Sequences duplicated - df "
# A tibble: 0 x 2
# ... with 2 variables: genbank_accession <chr>, n <int>
[1] "Sequences updated with 2 entries in PR2 (e.g. with and without introns) - df "
6 Taxonomy
6.1 Build and check
pr2_taxo_updated <- pr2_update %>% group_by_at(pr2.env$taxo_levels) %>% count()
pr2_taxo_check(pr2_taxo_updated, full_path("taxo"))
pr2_taxo_updated <- pr2_taxo_updated %>% rename_all(~str_c(., "_new")) %>% rename(species = species_new) %>%
left_join(pr2_taxo) %>% rename_at(pr2.env$taxo_levels, ~str_c(., "_old")) %>%
rename_all(~str_replace(., "_new", "")) %>% rename(species = species_old)
write.xlsx(pr2_taxo_updated, full_path("taxo_updated.xlsx"))
6.2 Find taxa in PR2 that are not included in the update
7 New sequences
7.1 List of new sequences
- 17 new sequences
7.2 Run the script "
PR2 genbank dowmnload.R` in background
7.3 Read the metadata files and join by genbank accession
metafile <- list()
for (i in c(1:2)) {
metafile[[i]] <- read_tsv(full_path(str_c("pr2_gb_metadata_", i, ".txt")), col_type = cols(gb_collection_date = col_character())) %>%
mutate(gb_isolate = as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>% distinct() %>% rename(gb_sequence = sequence)
pr2_new <- left_join(pr2_new, gb_metadata, by = c(genbank_accession = "genbank_accession"))
7.4 Read the features file and join
gb_features <- read_tsv(full_path("pr2_gb_features.txt")) %>% filter(product == "18S ribosomal RNA") %>%
rename(start_feature = start, end_feature = end) %>% distinct()
pr2_new <- left_join(pr2_new, gb_features, by = c(genbank_accession = "genbank_accession"))
write.xlsx(pr2_new, full_path("pr2_new.xlsx"), na = "")
8 Finalization
8.1 Taxo
8.2 Old sequences
pr2_update_old <- pr2_update %>% filter((genbank_accession %in% pr2$genbank_accession))
pr2_species_old <- pr2_18S %>% select(genbank_accession, species) %>% rename(species_old = species)
pr2_main_old <- pr2_update_old %>% select(genbank_accession, species, chimera, edited_remark,
removed_version) %>% mutate(edited_version = pr2.env$version, edited_by = pr2.env$editor) %>%
left_join(pr2_species_old) %>% filter((species != species_old) | (chimera ==
1)) # only keep sequences for which species has been changed or labelled as chimera
nrow(pr2_main_old)
[1] 2
8.3 New sequences
8.3.1 Read new sequences once the corrections have been done
8.3.2 Add fields
pr2_new <- pr2_new %>% # mutate(start=1, end=sequence_length, label = case_when (intron == 'intron' ~
# 'G', TRUE ~'U')) %>%
mutate(sequence = gb_sequence, sequence_length = str_length(sequence), label = "U",
pr2_accession = str_c(genbank_accession, ".", start, ".", end, "_", label), reference_sequence = NA,
gene = "18S rRNA", organelle = "nucleus", ambiguities = str_count(sequence, pattern = pr2.env$ambig_regex))
pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence, digest::sha1)
8.3.3 Remove sequences with ambiguities or too short
filter(pr2_new, (sequence_length < pr2.env$sequence_length_min) | (ambiguities >
pr2.env$ambiguities_max)) %>% arrange(desc(sequence_length)) %>% select(pr2_accession,
class, genus, sequence_length, ambiguities, species, gb_definition)
# A tibble: 0 x 7
# ... with 7 variables: pr2_accession <chr>, class <chr>, genus <chr>,
# sequence_length <int>, ambiguities <int>, species <chr>,
# gb_definition <chr>
8.3.4 Check duplicate entries
# A tibble: 0 x 2
# ... with 2 variables: pr2_accession <chr>, n_seq <int>
8.3.5 Final files for uploading
[1] " Final number of sequences added: 17"
[1] "genbank_accession" "start"
[3] "end" "label"
[5] "gene" "organelle"
[7] "kingdom" "supergroup"
[9] "division" "class"
[11] "order" "family"
[13] "genus" "species"
[15] "sequence" "gb_definition"
[17] "gb_organism" "gb_organelle"
[19] "gb_strain" "gb_isolate"
[21] "gb_clone" "gb_specimen_voucher"
[23] "gb_collected_by" "gb_lat_lon"
[25] "gb_note" "gb_culture_collection"
[27] "gb_isolation_source" "gb_host"
[29] "gb_environmental_sample" "gb_collection_date"
[31] "gb_country" "gb_date"
[33] "gb_locus" "gb_sequence"
[35] "pr2_sample_type" "sequence_length"
[37] "pr2_longitude" "pr2_latitude"
[39] "seqnames" "start.y"
[41] "end.y" "width"
[43] "strand" "type"
[45] "product" "loctype"
[47] "pr2_accession" "reference_sequence"
[49] "ambiguities" "sequence_hash"
pr2_update_new <- pr2_new
pr2_main_new <- pr2_update_new %>% select(pr2_accession, genbank_accession, start,
end, label, species, organelle, gene) %>% mutate(added_version = pr2.env$version,
edited_by = pr2.env$editor)
nrow(pr2_main_new)
[1] 17
pr2_sequences_new <- pr2_update_new %>% select(pr2_accession, sequence, gb_sequence,
sequence_length, ambiguities, sequence_hash)
nrow(pr2_sequences_new)
[1] 17
pr2_metadata_new <- pr2_update_new %>% select(genbank_accession, starts_with("gb_"),
starts_with("pr2_"), starts_with("eukref_")) %>% select(-pr2_accession, -gb_sequence) %>%
distinct()
nrow(pr2_metadata_new)
[1] 17
9 Save everything to an Excel file
file_pr2_imports <- full_path("pr2_imports_final.xlsx")
onglets <- list("pr2_taxo_updated" = pr2_taxo_updated,
# "pr2_taxo_remove" = pr2_taxo_remove,
"pr2_main_old" = pr2_main_old,
# "pr2_metadata_old" = pr2_metadata_old,
"pr2_main_new" = pr2_main_new,
"pr2_sequence_new" = pr2_sequences_new,
"pr2_metadata_new" = pr2_metadata_new)
write.xlsx(onglets, file_pr2_imports)