library("ggplot2")
library("dplyr")
library("knitr")
library("forcats")
library("stringr")
library("rworldmap")
library("treemapify")
library("pr2database")
packageVersion("pr2database")
#> [1] '5.1.0'
pr2 <- pr2_database()
pr2_photo <- pr2 %>%
filter((division %in% c("Chlorophyta", "Dinophyta",
"Cryptophyta",
"Haptophyta", "Ochrophyta")) &
!(class %in% c("Syndiniales", "Sarcomonadea")))
pr2_ref <- pr2 %>% filter(!is.na(reference_sequence))
PR2 fields
colnames(pr2)
#> [1] "pr2_main_id" "pr2_accession"
#> [3] "domain" "supergroup"
#> [5] "division" "subdivision"
#> [7] "class" "order"
#> [9] "family" "genus"
#> [11] "species" "genbank_accession"
#> [13] "start" "end"
#> [15] "label" "gene"
#> [17] "organelle" "species_version_5.0"
#> [19] "reference_sequence" "remark"
#> [21] "mixoplankton" "HAB_species"
#> [23] "ecological_function" "worms_id"
#> [25] "worms_marine" "worms_brackish"
#> [27] "worms_freshwater" "worms_terrestrial"
#> [29] "gbif_id" "sequence"
#> [31] "sequence_length" "ambiguities"
#> [33] "sequence_hash" "gb_date"
#> [35] "gb_division" "gb_definition"
#> [37] "gb_organism" "gb_organelle"
#> [39] "gb_taxonomy" "gb_strain"
#> [41] "gb_culture_collection" "gb_clone"
#> [43] "gb_isolate" "gb_isolation_source"
#> [45] "gb_specimen_voucher" "gb_host"
#> [47] "gb_collection_date" "gb_environmental_sample"
#> [49] "gb_country" "gb_lat_lon"
#> [51] "gb_collected_by" "gb_note"
#> [53] "gb_publication" "gb_authors"
#> [55] "gb_journal" "eukref_name"
#> [57] "eukref_source" "eukref_env_material"
#> [59] "eukref_env_biome" "eukref_biotic_relationship"
#> [61] "eukref_specific_host" "eukref_geo_loc_name"
#> [63] "eukref_notes" "pr2_sample_type"
#> [65] "pr2_sample_method" "pr2_latitude"
#> [67] "pr2_longitude" "pr2_depth"
#> [69] "pr2_ocean" "pr2_sea"
#> [71] "pr2_sea_lat" "pr2_sea_lon"
#> [73] "pr2_country" "pr2_location"
#> [75] "pr2_location_geoname" "pr2_location_geotype"
#> [77] "pr2_location_lat" "pr2_location_lon"
#> [79] "pr2_sequence_origin" "pr2_size_fraction"
#> [81] "metadata_remark" "pr2_continent"
#> [83] "pr2_country_geocode" "pr2_country_lat"
#> [85] "pr2_country_lon" "eukribo_UniEuk_taxonomy_string"
#> [87] "eukribo_V4" "eukribo_V9"
#> [89] "silva_taxonomy" "organelle_code"
#> [91] "species_rod" "infraspecific_name"
#> [93] "isolate" "assembly_level"
#> [95] "assembly_type" "gc_percent"
#> [97] "pubmed_id" "taxo_id"
#> [99] "species_url" "accession_genbank_link"
Basic statistics
All taxa
Total number of PR2 sequences : 240199
pr2_taxa <- pr2 %>% select(domain:genus, species) %>%
summarise_all(funs(n_distinct(.)))
knitr::kable(pr2_taxa, caption="Number of taxa - all sequences")
domain | supergroup | division | subdivision | class | order | family | genus | species |
---|---|---|---|---|---|---|---|---|
8 | 39 | 105 | 140 | 449 | 1136 | 2423 | 26265 | 54122 |
Photosynthetic protists
Number of photosynthetic protist sequences : 15412
pr2_taxa <- pr2_photo %>% select(domain:genus, species) %>%
summarise_all(funs(n_distinct(.)))
knitr::kable(pr2_taxa, caption="Number of taxa - photosynthetic protist sequences")
domain | supergroup | division | subdivision | class | order | family | genus | species |
---|---|---|---|---|---|---|---|---|
1 | 3 | 3 | 3 | 24 | 62 | 94 | 516 | 1822 |
Reference sequences
Reference sequences are a subset of PR2 representative of taxonomic groups.
Number of reference sequences : 240199
pr2_taxa <- pr2_ref %>% select(domain:genus, species) %>%
summarise_all(funs(n_distinct(.)))
knitr::kable(pr2_taxa, caption="Number of taxa - Reference sequences")
domain | supergroup | division | subdivision | class | order | family | genus | species |
---|---|---|---|---|---|---|---|---|
8 | 39 | 105 | 140 | 449 | 1136 | 2423 | 26265 | 54122 |
Sequence length
ggplot(pr2) + geom_histogram(aes(sequence_length), binwidth = 100, fill="blue") +
xlim(0,3000) + xlab("PR2 sequence length") +
ylab("Number of sequences") +
ggtitle("All sequences")
ggplot(pr2_ref) + geom_histogram(aes(sequence_length), binwidth = 100, fill="blue") +
xlim(0,3000) +
xlab("PR2 sequence length") +
ylab("Number of sequences") +
ggtitle("Reference sequences")
Taxonomic composition
pr2_treemap <- function(pr2, level1, level2) {
# Group
pr2_class <- pr2 %>%
count({{level1}},{{level2}}) %>%
filter(!is.na(division)) %>%
ungroup()
# Do a treemap
ggplot(pr2_class, aes(area = n,
fill = {{level2}},
subgroup = {{level1}},
label = {{level2}})) +
treemapify::geom_treemap()
ggplot(pr2_class, aes(area = n,
fill= {{level1}},
subgroup = {{level1}},
label = {{level2}})) +
treemapify::geom_treemap() +
treemapify::geom_treemap_text(colour = "white", place = "centre", grow = TRUE) +
treemapify::geom_treemap_subgroup_border() +
treemapify::geom_treemap_subgroup_text(place = "centre", grow = T,
alpha = 0.5, colour = "black",
min.size = 0) +
theme_bw() +
scale_color_brewer() +
guides(fill = FALSE)
}
Genera most represented
All taxa
pr2_genus <- pr2 %>% group_by(class, genus) %>%
count() %>%
ungroup() %>%
top_n(30)
ggplot(pr2_genus) +
geom_col(aes(x=forcats::fct_reorder(stringr::str_c(class,"-",genus), n), y=n)) +
coord_flip() +
ggtitle("Most represented genera - all") +
xlab("Genera") + ylab("Number of sequences")
Reference sequences
pr2_genus <- pr2_ref %>%
group_by(class, genus) %>%
count() %>% ungroup() %>%
top_n(30)
ggplot(pr2_genus) +
geom_col(aes(x=forcats::fct_reorder(stringr::str_c(class,"-",genus), n), y=n)) +
coord_flip() +
ggtitle("Reference sequences") +
xlab("Genera") +
ylab("Number of sequences")
Only photosynthetic protists
pr2_genus <- pr2_photo %>%
group_by(class, genus) %>%
count() %>%
ungroup() %>%
top_n(30)
ggplot(pr2_genus) +
geom_col(aes(x=forcats::fct_reorder(stringr::str_c(class,"-",genus), n), y=n)) +
coord_flip() +
ggtitle("Most represented genera - only photosynthetic protists") +
xlab("Genera") + ylab("Number of sequences")
World sequence distribution
map_get_world <- function(resolution="coarse"){
# Change to "coarse" for global maps / "low" for regional maps
worldMap <- rworldmap::getMap(resolution = resolution)
world.points <- fortify(worldMap)
world.points$region <- world.points$id
world.df <- world.points[,c("long","lat","group", "region")]
}
map_world <- function(color_continents = "grey80",
color_borders = "white",
resolution = "coarse") {
# Background map using the maps package
# world.df <- map_data("world")
world.df <- map_get_world(resolution)
map <- ggplot() +
geom_polygon(data = world.df,
aes(x=long, y = lat, group = group),
fill=color_continents,
color=color_borders) +
# scale_fill_manual(values= color_continents , guide = FALSE) +
scale_x_continuous(breaks = (-4:4) * 45) +
scale_y_continuous(breaks = (-2:2) * 30) +
xlab("Longitude") + ylab("Latitude") +
coord_fixed(1.3) +
theme_bw()
# species_map <- species_map + coord_map () # Mercator projection
# species_map <- species_map + coord_map("gilbert") # Nice for the poles
return(map)
}
All taxa
map_world() + geom_point(data=pr2, aes(x=pr2_longitude, y=pr2_latitude),
fill="blue", size=2, shape=21) +
ggtitle("PR2 - all sequences")
Photosynthetic protists
map_world() + geom_point(data=pr2_photo, aes(x=pr2_longitude, y=pr2_latitude),
fill="red", size=2, shape=21) +
ggtitle("PR2 - photosynthetic protists sequences")