PR2
Adding GenBank sequences
1 Instructions to add new Genbank sequences
- Do a query using genbank_search query
- Filter for sequences not present in PR2
- Save as an rds file
- Move rds to Roscoff server and run script called script_genbank.R using sbatch_genbank.sh
- Get the metadata and sequences
- Get the features
- MOve metadata and features result file to local directory
- Run rest of script to filter the data and save as an Excel file
- If find features files then only get the rRNA part
2 Init
source('PR2_init.R', echo=FALSE)
source('PR2_read_google.R', echo=FALSE)
3 Set up the files
<- "4.14/GenBank"
dir_pr2_update
$editor <- "D. Vaulot"
pr2.env
<- function(file_name){str_c(dir_pr2_update,"/", file_name)}
full_path
# create the directory for taxonomy output
dir.create(full_path(""), showWarnings = FALSE)
4 Part I - Get sequence list from GenBank and save as rds file
Skip this part if you have already the accession number list
4.1 Genbank search
<- list()
pr2_update
= "NOT homo[TITL] NOT Streptophyta[ORGN] NOT Fungi[ORGN] NOT Metazoa[ORGN] NOT Bacteria[ORGN] NOT mitochondrion[TITL]"
query_exclude
= "(18S ribosomal RNA gene[TITL] OR small subunit ribosomal RNA gene[TITL])"
query_include
= "AND 2020/05/01:2021/03/23[Publication Date]"
query_date
1]] <- genbank_search(query = str_c(query_include, query_exclude, query_date, sep=" "), seq_max = 15000)
pr2_update[[2]] <- genbank_search(query = "small subunit ribosomal RNA gene[TITL] AND Cafeteria[ORGN]", seq_max = 10000)
pr2_update[[# pr2_update[[3]] <- genbank_search(query = "MN315515[ACCN]", seq_max = 10000)
<- reduce(pr2_update, bind_rows)
pr2_update_all
# export(pr2_update_all, full_path("pr2_genbank_new_2021.xlsx"),
# zoom = 90, firstRow = TRUE, firstCol = TRUE)
4.2 Filter new sequences
<- pr2_update_all %>%
pr2_new filter(!(genbank_accession %in% pr2$genbank_accession))
<- pr2_new %>%
pr2_genbank_new select(genbank_accession)
4.3 Save the list of new sequences
- Move this file to Roscoff server
saveRDS(pr2_genbank_new, full_path("pr2_genbank_new.rds"))
5 Part II - Run the script PR2 genbank download.R on Roscoff server
6 Part III - Read file with all Genbank info and save to Excel file for import
6.1 Read the metadata files and features files and join by genbank accession
<- read_tsv(full_path("pr2_gb_metadata_2021-03-23.txt"), guess_max = 20000)
pr2_new
<- read_tsv(full_path("pr2_gb_features_2021-03-24.txt"), guess_max = 20000) %>%
pr2_features select(-seqnames) %>%
filter(product %in% c("small subunit ribosomal RNA", "18S ribosomal RNA", "18S small subunit ribosomal RNA") |
%in% c("small subunit ribosomal RNA", "18S ribosomal RNA", "18S small subunit ribosomal RNA")) %>%
note distinct()
<- pr2_new %>%
pr2_new left_join(pr2_features) %>%
mutate(gb_isolate=as.character(gb_isolate)) %>%
distinct()
6.2 Add fields
<- pr2_new %>%
pr2_new mutate(start=case_when(is.na(start) ~ 1,
TRUE ~ start),
end=case_when(is.na(end) ~sequence_length,
TRUE ~ end),
label = "U",
pr2_accession = str_c(genbank_accession,".",start,".", end,"_",label),
reference_sequence = NA,
gene = case_when( str_detect(gb_definition, "nucleomorph") ~ "18S_rRNA",
str_detect(gb_definition, "chloroplast|mitochondr") ~ "16S_rRNA",
TRUE ~ "18S_rRNA"),
organelle = case_when(str_detect(gb_definition, "nucleomorph") ~ "nucleomorph",
str_detect(gb_definition, "chloroplast") ~ "plastid",
str_detect(gb_definition, "mitochondr") ~ "mitochondrion",
TRUE ~ "nucleus"),
sequence = str_sub(gb_sequence, start, end),
sequence_length = str_length(sequence),
ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex),
# Next time remove from here because it is now in the script to run on the server
pr2_sample_type = case_when(str_detect(gb_definition, "uncultured|Uncultured") ~ "environmental",
str_detect(gb_isolation_source, "(?i)water|sediment|ice|plankton|blood|fecal|soil|lake|river|gulf") ~ "environmental",
str_detect(gb_organism, "metagenom") ~ "environmental",
!is.na(gb_specimen_voucher)~ "isolate",
TRUE ~ pr2_sample_type)
)
$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1) pr2_new
6.3 Remove sequences with ambiguities or too short
<- pr2_new %>%
pr2_remove filter((sequence_length < pr2.env$sequence_length_min)|
> pr2.env$sequence_length_max)|
(sequence_length > pr2.env$ambiguities_max)|
(ambiguities str_detect(sequence, pr2.env$sequence_N_repeat)) %>%
arrange(desc(sequence_length)) %>%
select(pr2_accession, sequence_length,ambiguities, gb_definition)
::datatable(pr2_remove, caption = "Sequence to remove - too short, too many ambiguities etc...") DT
<- pr2_new %>%
pr2_new filter(!(pr2_accession %in% pr2_remove$pr2_accession) )
6.3.1 Check duplicate entries
%>%
pr2_new count(pr2_accession) %>%
filter(n > 1)
# A tibble: 0 x 2
# ... with 2 variables: pr2_accession <chr>, n <int>
6.4 Get some statistics
ggplot(pr2_new, aes(x=sequence_length)) +
geom_histogram() +
xlim(0,3000)
6.4.1 Final files for uploading
sprintf(" Final number of sequences added: %d", nrow(pr2_new))
[1] " Final number of sequences added: 8765"
colnames(pr2_new)
[1] "genbank_accession" "gb_id" "gb_project_id" "gb_definition" "gb_organism" "gb_strain" "gb_organelle" "gb_isolate"
[9] "gb_clone" "gb_specimen_voucher" "gb_collected_by" "gb_lat_lon" "gb_note" "gb_culture_collection" "gb_isolation_source" "gb_host"
[17] "gb_environmental_sample" "gb_collection_date" "gb_country" "gb_date" "pr2_longitude" "pr2_latitude" "pr2_sample_type" "gb_sequence"
[25] "sequence_length" "start" "end" "width" "strand" "type" "note" "loctype"
[33] "product" "estimated_length" "label" "pr2_accession" "reference_sequence" "gene" "organelle" "sequence"
[41] "ambiguities" "sequence_hash"
<- pr2_new
pr2_update_new
<- pr2_update_new %>%
pr2_main_new select (pr2_accession, genbank_accession, start,end, label, organelle, gene) %>%
mutate(added_version = pr2.env$version, edited_by = pr2.env$editor)
nrow(pr2_main_new)
[1] 8765
<- pr2_update_new %>%
pr2_sequences_new select(pr2_accession, sequence, sequence_length, ambiguities, sequence_hash)
nrow(pr2_sequences_new)
[1] 8765
<- pr2_update_new %>%
pr2_metadata_new select(genbank_accession, starts_with("gb_"),starts_with("pr2_"), starts_with("eukref_")) %>%
select( -pr2_accession) %>%
distinct()
nrow(pr2_metadata_new)
[1] 8759
7 Save everything to an Excel file
<- full_path("pr2_genbank_import_final.xlsx")
file_pr2_import <- list("pr2_main_new" = pr2_main_new,
onglets "pr2_sequence_new" = pr2_sequences_new,
"pr2_metadata_new" = pr2_metadata_new)
export(onglets, file_pr2_import, zoom = 90, firstRow = TRUE, firstCol = TRUE)