PR2 version 4.13.0
Adding RCC sequences
PR2 version 4.13.0
Adding RCC sequences
1 Init
Load the variables common to the different scripts and the necessary libraries
2 Read pr2
- Use the PR2 from Google database
# Info for the local dabase
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))
# pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()
# pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
# pr2 <- left_join (pr2, pr2_seq)
# pr2 <- left_join (pr2, pr2_metadata)
db_disconnect(pr2_db_con)
## [1] TRUE
3 Set up the files
4 RCC database
4.1 Read RCC
db_rcc <- db_connect(db_info("rcc_local") )
cultures <- tbl(db_rcc, "cultures") %>%
collect()
taxonomy <- tbl(db_rcc, "taxonomy") %>%
collect()
cultures <- left_join(cultures, taxonomy)
# cultures <- cultures %>% filter(lost==0)
cultures <- cultures %>% mutate(year_entered = lubridate::year(date_entered_catalog),
year_sampled = lubridate::year(lubridate::parse_date_time(sampling_date, "ymd")),
year_lost = lubridate::year(lost_date),
latitude = mapply(lat_long_dec,sampling_lat_deg,sampling_lat_mn,sampling_lat_ns,"latitude"),
longitude = mapply(lat_long_dec,sampling_long_deg,sampling_long_mn,sampling_long_ew,"longitude"))
sequences <- tbl(db_rcc, "sequences") %>% collect()
db_disconnect(db_rcc)
# Build some tables -------------------------------------------------------
sequences_18S <- sequences %>%
filter(str_detect(gene_name, "18S")) %>%
select(rcc_id, genbank_accession, sequence) %>%
rename(accession_18S = genbank_accession, sequence_18S = sequence) %>%
mutate(sequence_18S_length = str_length(sequence_18S))
sequences_18S_longer <- sequences_18S %>%
arrange(rcc_id, desc(sequence_18S_length)) %>%
distinct(rcc_id, .keep_all = TRUE)
4.2 Construct and export tables
pr2_update <- cultures %>%
rename(kingdom=domain) %>%
arrange(kingdom, division, class, order, family, genus, species, rcc_id) %>%
select(kingdom, division, class, order, family, genus, species, rcc_id, lost_date,
strain_name, strain_name_synonyms,
sampling_ocean, sampling_regional_sea , sampling_depth,
sampling_date, latitude, longitude) %>%
left_join(sequences_18S_longer) %>%
rename(genbank_accession = accession_18S) %>%
filter(!(genbank_accession %in% pr2$genbank_accession ) & !is.na(genbank_accession)) %>%
# left_join(sequences_ITS) %>%
select_if(~!all(is.na(.))) %>% # Remove all empty columns
filter(sequence_18S_length >= pr2.env$sequence_length_min) %>% # Remove sequences that are too short
mutate(species = str_replace(species, "_sp", "_sp.")) %>% # Replace sp by sp.
mutate(species = str_replace(species, "_cf.+", "_sp.")) %>% # Replace cf_xxx by sp.
filter(!str_detect(species, "Eukaryota_XXXXX")) # Remove unassigned sequences
write.xlsx(pr2_update, full_path("pr2_rcc.xlsx"), zoom = 90, firstActiveRow = 2, firstActiveCol = 9, colWidths = "auto")
4.3 Reload the file after manual correction
4.4 Compare with sequences in PR2
str_c("Number of updated sequences that are not present in PR2 : ",
nrow(filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))))
## [1] "Number of updated sequences that are not present in PR2 : 1244"
str_c("Number of updated sequences that are not active in PR2 : ",
nrow(filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession))))
## [1] "Number of updated sequences that are not active in PR2 : 0"
## [1] "Sequences duplicated - df "
## [1] genbank_accession n
## <0 rows> (or 0-length row.names)
5 Taxonomy
5.1 Build and check
# Only 7 levels in RCC
taxo_levels <- c(pr2.env$taxo_levels[1], pr2.env$taxo_levels[3:8])
pr2_taxo_updated <- pr2_update %>%
group_by_at(taxo_levels) %>%
count() %>%
ungroup()
pr2_taxo_check(pr2_taxo_updated, taxo_levels = taxo_levels, full_path("taxo"))
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `transmute_()` is deprecated as of dplyr 0.7.0.
## Please use `transmute()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Joining, by = "name"
pr2_taxo_updated <- pr2_taxo_updated %>%
rename_all(~ str_c(.,"_new" )) %>%
rename(species = species_new) %>%
left_join(pr2_taxo) %>%
rename_at(pr2.env$taxo_levels, ~ str_c(.,"_old" )) %>%
rename_all( ~ str_replace(.,"_new", "" )) %>%
rename(species = species_old)
## Joining, by = "species"
5.2 For taxa not yet in PR2 find if genus is present
pr2_taxo_to_add <- pr2_taxo_updated %>%
filter(is.na(taxo_id)) %>%
select(genus, species, n, taxo_edited_version, taxo_edited_by) %>%
left_join(select(pr2_taxo, kingdom:genus)) %>%
distinct()
## Joining, by = "genus"
6 New sequences
6.1 List of new sequences
6.2 Run the script PR2 genbank download.R in background
6.3 Read the metadata files and join by genbank accession
metafile <- list()
for (i in c(1:10)) {
metafile[[i]] <- read_tsv(full_path(str_c("pr2_gb_metadata_",i,".txt")),
col_type = cols(gb_collection_date = col_character())) %>%
mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
gb_metadata <- reduce(metafile, bind_rows) %>%
distinct() %>%
rename(gb_sequence = sequence)
pr2_new <- left_join(pr2_new, gb_metadata, by = c("genbank_accession" = "genbank_accession"))
6.4 Read the features file and join
gb_features <- read_tsv(full_path("pr2_gb_features.txt")) %>%
filter(product == "18S ribosomal RNA") %>%
rename(start_feature = start, end_feature = end) %>%
distinct()
## Parsed with column specification:
## cols(
## seqnames = col_character(),
## start = col_double(),
## end = col_double(),
## width = col_double(),
## strand = col_character(),
## type = col_character(),
## product = col_character(),
## loctype = col_character(),
## genbank_accession = col_character(),
## note = col_character()
## )
7 Finalization
7.1 Read back the taxo corrected
7.2 New sequences
7.2.1 Read new sequences once the corrections have been done
- Sequences with ITS region removed
- Sequences with introns removed
7.2.2 Add fields
pr2_new <- pr2_new %>%
# mutate(start=1, end=sequence_length,
# label = case_when (intron == "intron" ~ "G",
# TRUE ~"U")) %>%
mutate(sequence=str_sub(gb_sequence,start,end),
sequence_length = str_length(sequence),
label = "U",
pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = NA,
gene=case_when(str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA",
str_detect(gb_definition, "chloroplast") ~ "16S_rRNA",
TRUE ~ "18S_rRNA"),
organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph",
str_detect(gb_definition, "chloroplast") ~ "plastid",
TRUE ~ "nucleus"),
ambiguities = str_count(sequence,pattern=pr2.env$ambig_regex)
)
pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1)
7.2.3 Remove sequences with ambiguities or too short
filter(pr2_new, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>%
arrange(desc(sequence_length)) %>%
select(pr2_accession, species, sequence_length,ambiguities, species,gb_definition)
## # A tibble: 1 x 5
## pr2_accession species sequence_length ambiguities gb_definition
## <chr> <chr> <int> <int> <chr>
## 1 MF179484.1.18~ Hemiselm~ 1847 34 Hemiselmis virescens vou~
7.2.4 Check duplicate entries
## # A tibble: 0 x 2
## # ... with 2 variables: pr2_accession <chr>, n <int>
7.2.5 Final files for uploading
FInal number of sequences: 1129
## [1] " Final number of sequences added: 1129"
## [1] "genbank_accession" "pr2_longitude"
## [3] "pr2_latitude" "sampling_date"
## [5] "pr2_depth" "pr2_sea"
## [7] "pr2_ocean" "strain_name"
## [9] "rcc_id" "species"
## [11] "gb_definition" "gb_organism"
## [13] "gb_organelle" "gb_strain"
## [15] "gb_isolate" "gb_clone"
## [17] "gb_specimen_voucher" "gb_collected_by"
## [19] "gb_lat_lon" "gb_note"
## [21] "gb_culture_collection" "gb_isolation_source"
## [23] "gb_host" "gb_environmental_sample"
## [25] "gb_collection_date" "gb_country"
## [27] "gb_date" "gb_locus"
## [29] "gb_sequence" "pr2_sample_type"
## [31] "sequence_length" "seqnames"
## [33] "start" "end"
## [35] "start_feature" "end_feature"
## [37] "width" "strand"
## [39] "type" "product"
## [41] "loctype" "note"
## [43] "sequence" "label"
## [45] "pr2_accession" "reference_sequence"
## [47] "gene" "organelle"
## [49] "ambiguities" "sequence_hash"
pr2_update_new <- pr2_new
pr2_main_new <- pr2_update_new %>%
select (pr2_accession, genbank_accession, start,end, label, species, organelle, gene) %>%
mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
nrow(pr2_main_new)
## [1] 1129
pr2_sequences_new <- pr2_update_new %>%
select(pr2_accession, sequence, gb_sequence, sequence_length, ambiguities, sequence_hash)
nrow(pr2_sequences_new)
## [1] 1129
pr2_metadata_new <- pr2_update_new %>%
select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>%
select( -pr2_accession, -gb_sequence) %>%
distinct()
nrow(pr2_metadata_new)
## [1] 1129
8 Save everything to an Excel file
file_pr2_imports <- full_path("pr2_imports_final.xlsx")
onglets <- list("pr2_taxo_updated" = pr2_taxo_updated,
# "pr2_taxo_remove" = pr2_taxo_remove,
#"pr2_main_old" = pr2_main_old,
# "pr2_metadata_old" = pr2_metadata_old,
"pr2_main_new" = pr2_main_new,
"pr2_sequence_new" = pr2_sequences_new,
"pr2_metadata_new" = pr2_metadata_new)
write.xlsx(onglets, file_pr2_imports, zoom = 90, firstRow = TRUE, firstCol = TRUE)
## Warning in file.create(to[okay]): impossible de créer le fichier '../updates/
## 2020 4.13.0 RCC/pr2_imports_final.xlsx', à cause de 'Permission denied'