PR2 version 4.13.0
Adding GenBank sequences
PR2 version 4.13.0
Adding GenBank sequences
1 Init
Load the variables common to the different scripts and the necessary libraries
2 Read pr2
- Use the PR2 from Google database
# Info for the local dabase
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))
# pr2_seq <- tbl(pr2_db_con, "pr2_sequences")%>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter (is.na(taxo_removed_version))%>% collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata")%>% collect()
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c("species"="species"))
# pr2 <- left_join (pr2, pr2_seq)
# pr2 <- left_join (pr2, pr2_metadata)
db_disconnect(pr2_db_con)
## [1] TRUE
3 Set up the files
dir_pr2_update <- "../updates/2020 4.13.0 GenBank"
pr2.env$editor <- "D. Vaulot"
full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}
# file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva samples.fasta")
file_fasta <- full_path("arb-silva.de_2020-05-18_id824681_tax_silva.fasta.gz")
# create the directory for taxonomy output
dir.create(full_path("taxo"), showWarnings = FALSE)
4 Get sequence list from GenBank
pr2_update <- genbank_search(query = "18S ribosomal RNA gene[TITL] NOT internal transcribed spacer[TITL] NOT homo[TITL] AND 2018/11/16:2018/12/31[Publication Date]", seq_max = 10000)
write.xlsx(pr2_update, full_path("pr2_genbank_new_2018.xlsx"),
zoom = 90, firstRow = TRUE, firstCol = TRUE)
pr2_update <- genbank_search(query = "18S ribosomal RNA gene[TITL] NOT internal transcribed spacer[TITL] NOT homo[TITL] AND 2019[Publication Date]", seq_max = 10000)
write.xlsx(pr2_update, full_path("pr2_genbank_new_2019.xlsx"),
zoom = 90, firstRow = TRUE, firstCol = TRUE)
pr2_update <- genbank_search(query = "18S ribosomal RNA gene[TITL] NOT internal transcribed spacer[TITL] NOT homo[TITL] AND 2020[Publication Date]", seq_max = 10000)
write.xlsx(pr2_update, full_path("pr2_genbank_new_2020.xlsx"),
zoom = 90, firstRow = TRUE, firstCol = TRUE)
pr2_update_2018 <- read_xlsx(full_path("pr2_genbank_new_2018.xlsx"))
pr2_update_2019 <- read_xlsx(full_path("pr2_genbank_new_2019.xlsx"))
pr2_update_2020 <- read_xlsx(full_path("pr2_genbank_new_2020.xlsx"))
pr2_update <- bind_rows(pr2_update_2018, pr2_update_2019, pr2_update_2020)
4.1 Compare with sequences in PR2
str_c("Number of updated sequences that are not present in PR2 : ",
nrow(filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))))
## [1] "Number of updated sequences that are not present in PR2 : 12504"
str_c("Number of updated sequences that are not active in PR2 : ",
nrow(filter(pr2_update, (genbank_accession %in% pr2_18S_removed$genbank_accession))))
## [1] "Number of updated sequences that are not active in PR2 : 6"
## [1] "Sequences duplicated - df "
## # A tibble: 0 x 2
## # ... with 2 variables: genbank_accession <chr>, n <int>
5 New sequences
5.1 Filter new sequences
5.2 Save the list of new sequences
5.3 Run the script PR2 genbank download.R in background
5.4 Read the metadata files and join by genbank accession
metafile <- list()
for (i in c(1:10)) {
metafile[[i]] <- read_tsv(full_path(str_c("metadata/pr2_gb_metadata_3_",i,".txt")),
col_type = cols(gb_collection_date = col_character())) %>%
mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
pr2_new <- reduce(metafile, bind_rows) %>%
distinct() %>%
filter(!str_detect(gb_definition, "transcribed spacer")) %>%
filter(sequence_length >= pr2.env$sequence_length_min & sequence_length <= 3000)
6 Finalization
6.1 Read back the taxo corrected
6.2 New sequences
6.2.1 Read new sequences once the corrections have been done
- Sequences with ITS region removed
- Sequences with introns removed
6.2.2 Add fields
pr2_new <- pr2_new %>%
mutate(start=1, end=sequence_length,
label = "U",
pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = NA,
gene=case_when(str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA",
str_detect(gb_definition, "chloroplast") ~ "16S_rRNA",
TRUE ~ "18S_rRNA"),
organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph",
str_detect(gb_definition, "chloroplast") ~ "plastid",
TRUE ~ "nucleus"),
ambiguities = str_count(gb_sequence, pattern=pr2.env$ambig_regex),
pr2_sample_type = case_when(str_detect(gb_definition, "uncultured|Uncultured") ~ "environmental",
TRUE ~ pr2_sample_type)
)
pr2_new$sequence_hash = purrr::map_chr(pr2_new$gb_sequence,digest::sha1)
6.2.3 Remove sequences with ambiguities or too short
filter(pr2_new, (sequence_length < pr2.env$sequence_length_min)|(ambiguities > pr2.env$ambiguities_max)) %>%
arrange(desc(sequence_length)) %>%
select(pr2_accession, sequence_length,ambiguities, gb_definition)
## # A tibble: 27 x 4
## pr2_accession sequence_length ambiguities gb_definition
## <chr> <dbl> <int> <chr>
## 1 MG972345.1.287~ 2875 103 Chaetoceros sp. Na12A3 isolate N~
## 2 MG972343.1.287~ 2875 103 Chaetoceros sp. Na12A3 18S ribos~
## 3 MG972335.1.268~ 2689 131 Chaetoceros diversus isolate Na3~
## 4 MG972337.1.262~ 2621 130 Chaetoceros diversus isolate Na1~
## 5 MG972364.1.245~ 2455 87 Chaetoceros sp. Na28A1 18S ribos~
## 6 KL780694.1.243~ 2432 100 Manihot esculenta cultivar KU50 ~
## 7 MF497377.1.235~ 2356 830 Phryganella paradoxa isolate KD1~
## 8 MG972365.1.220~ 2204 120 Chaetoceros cf. vixvisibilis iso~
## 9 MN367945.1.211~ 2115 140 Bembidion platyderoides voucher ~
## 10 MG972366.1.209~ 2097 74 Chaetoceros cf. vixvisibilis iso~
## # ... with 17 more rows
6.2.4 Check duplicate entries
## # A tibble: 0 x 2
## # ... with 2 variables: pr2_accession <chr>, n <int>
6.3 Get some statistics
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
6.3.1 Final files for uploading
FInal number of sequences: 7032
## [1] " Final number of sequences added: 7032"
## [1] "genbank_accession" "gb_definition"
## [3] "gb_organism" "gb_strain"
## [5] "gb_organelle" "gb_isolate"
## [7] "gb_clone" "gb_specimen_voucher"
## [9] "gb_collected_by" "gb_lat_lon"
## [11] "gb_note" "gb_culture_collection"
## [13] "gb_isolation_source" "gb_host"
## [15] "gb_environmental_sample" "gb_collection_date"
## [17] "gb_country" "gb_date"
## [19] "pr2_longitude" "pr2_latitude"
## [21] "pr2_sample_type" "gb_sequence"
## [23] "sequence_length" "start"
## [25] "end" "label"
## [27] "pr2_accession" "reference_sequence"
## [29] "gene" "organelle"
## [31] "ambiguities" "sequence_hash"
pr2_update_new <- pr2_new
pr2_main_new <- pr2_update_new %>%
select (pr2_accession, genbank_accession, start,end, label, organelle, gene) %>%
mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
nrow(pr2_main_new)
## [1] 7032
pr2_sequences_new <- pr2_update_new %>%
select(pr2_accession, gb_sequence, sequence_length, ambiguities, sequence_hash)
nrow(pr2_sequences_new)
## [1] 7032
pr2_metadata_new <- pr2_update_new %>%
select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>%
select( -pr2_accession, -gb_sequence) %>%
distinct()
nrow(pr2_metadata_new)
## [1] 7032
7 Save everything to an Excel file
file_pr2_imports <- full_path("pr2_imports_final.xlsx")
onglets <- list(#"pr2_taxo_updated" = pr2_taxo_updated,
# "pr2_taxo_remove" = pr2_taxo_remove,
#"pr2_main_old" = pr2_main_old,
# "pr2_metadata_old" = pr2_metadata_old,
"pr2_main_new" = pr2_main_new,
"pr2_sequence_new" = pr2_sequences_new,
"pr2_metadata_new" = pr2_metadata_new)
write.xlsx(onglets, file_pr2_imports, zoom = 90, firstRow = TRUE, firstCol = TRUE)
8 Get metadata for which no metadata are available
8.1 Find sequences for which no metadata
8.2 Save the list of new sequences
8.3 Run the script PR2 genbank download.R in background
8.4 Read the metadata files and join by genbank accession
metafile <- list()
for (i in c(1:5)) {
metafile[[i]] <- read_tsv(full_path(str_c("metadata/pr2_gb_metadata_4_",i,".txt")),
col_type = cols(gb_collection_date = col_character())) %>%
mutate(gb_isolate=as.character(gb_isolate))
}
# The first file serves as a template for the following one when binding rows
pr2_metadata_old <- reduce(metafile, bind_rows) %>%
distinct()
pr2_metadata_old %>% count(genbank_accession) %>% filter (n>1)
## # A tibble: 0 x 2
## # ... with 2 variables: genbank_accession <chr>, n <int>