PR2 version 4.12.0
Misc
1 Major changes
- Sequences with more than 2 consecutives N are removed
- Create hash values in the pr2_sequence database
2 Init
Load the variables common to the different scripts and the necessary libraries
3 Read pr2
- Use the PR2 on local computer
# Info for the local dabase
pr2_db <- db_info("pr2_local")
pr2_db_con <- db_connect(pr2_db)
pr2_main <- tbl(pr2_db_con, "pr2_main") %>% collect()
# pr2_main <- pr2_main %>% filter (is.na(removed_version))
pr2_seq <- tbl(pr2_db_con, "pr2_sequences") %>% collect()
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version)) %>% collect()
pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>% collect()
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)
db_disconnect(pr2_db_con)
pr2 <- pr2 %>% filter(is.na(removed_version))
# pr2_18S <- pr2 %>% filter (is.na(removed_version)) %>% filter(gene == '18S
# rRNA') pr2_18S_removed <- pr2 %>% filter (!is.na(removed_version)) %>%
# filter(gene == '18S rRNA') # Remove pr2 to make sure that we do not use
# the full pr2 remove(pr2)
4 Clean up
4.1 Find sequences that have more than 2 consecutive N (they are problematic for primer analyses)
- 1692 sequences removed
pr2_NNN <- pr2 %>% filter(str_detect(sequence, "NNN"))
pr2_NNN_count <- pr2_NNN %>% group_by(supergroup, division, class) %>% tally()
pr2_NNN <- pr2_NNN %>% mutate(query = str_c("UPDATE pr2_main", " SET ", db_update_field_string("removed_version",
pr2.env$version), db_update_field_string("edited_remark", "Sequence with more than 2 consecutive N"),
db_update_field_string("edited_by", "D. Vaulot", append_comma = FALSE),
" WHERE pr2_accession = ", db_sql_escape(pr2_accession)))
pr2_NNN$query_result <- db_execute_query_vector(db_info("pr2_local"), pr2_NNN$query)
5 Hash values
- Identical sequences with different assignation. However this can be due to the fact that different species have same 18S.
- TO DO - Find contradiction at class level….
5.1 Add hash values to database
pr2$sequence_hash = purrr::map_chr(pr2$sequence, digest::sha1)
pr2 <- pr2 %>% mutate(query = str_c("UPDATE pr2_sequences", " SET ", db_update_field_string("sequence_hash",
sequence_hash, append_comma = FALSE), " WHERE pr2_accession = ", db_sql_escape(pr2_accession)))
pr2$query_result <- db_execute_query_vector(db_info("pr2_local"), pr2$query)
5.2 Find sequences with same hash values but different assignation
hash_contradict <- pr2 %>% group_by(sequence_hash, species) %>% tally() %>%
filter(n > 1) %>% select(-n) %>% group_by(sequence_hash) %>% tally() %>%
filter(n > 1) %>% arrange(desc(n))
pr2_contradict <- pr2 %>% filter(sequence_hash %in% hash_contradict$sequence_hash) %>%
arrange(sequence_hash, genus, species)
write_tsv(pr2_contradict, "../temp/pr2_contradict.tsv", na = "")
5.3 Find sequences with contradiction at class level
- Only 2 sequences due to the fact that genera with same name but from different supergroups.