PR2
Misaligned sequences Fred - issue # 26
1 Init
source('../PR2_init.R', echo=FALSE)
source('../PR2_read_google.R', echo=FALSE)
2 Set up the files
<- "Misaligned Fred"
dir_pr2_update
$editor <- "F. Mahé"
pr2.env
<- function(file_name){str_c(dir_pr2_update,"/", file_name)} full_path
3 Round # 1 - Fred
While working with cutadapt (v3.4) to extract the SSU V4 region, I’ve noticed that some entries are not in the expected orientation (compared to the reference Saccharomyces cerevisiae). I wrote a script to search for all possible orientations:
## primers from Stoeck et al. 2010
ORIGINAL_PRIMER_F="CCAGCASCYGCGGTAATTCC"
ORIGINAL_PRIMER_R="ACTTTCGTTCTTGATYRA"
complement() {
# complement a DNA/RNA IUPAC string
-z "${1}" ]] && { echo "error: empty string" ; exit 1 ; }
[[ local -r nucleotides="acgturykmbdhvswACGTURYKMBDHVSW"
local -r complements="tgcaayrmkvhdbswTGCAAYRMKVHDBSW"
tr "${nucleotides}" "${complements}" <<< "${1}"
}
trim_primers() {
# trim and keep only entries with both primers
OPTIONS="--minimum-length ${MIN_LENGTH} --discard-untrimmed --error-rate ${ERROR_RATE} --quiet"
CUTADAPT="$(which cutadapt) ${OPTIONS}"
MIN_LENGTH="32"
ERROR_RATE="0.2"
MIN_F=$(( ${#PRIMER_F} * 2 / 3 ))
MIN_R=$(( ${#PRIMER_R} * 2 / 3 ))
zcat "${SOURCE}.gz" | \
dos2unix | \
${CUTADAPT} \
--front "${PRIMER_F}" \
--overlap "${MIN_F}" - | \
${CUTADAPT} \
--adapter "${PRIMER_R}" \
--overlap "${MIN_R}" - > "${OUTPUT}"
}
## download PR2 (UTAX version)
URL="https://github.com/pr2database/pr2database/releases/download"
VERSION="4.13.0"
SOURCE="pr2_version_${VERSION}_18S_UTAX.fasta"
[[ -e "${SOURCE}.gz" ]] || wget "${URL}/v${VERSION}/${SOURCE}.gz"
## search rev-comp entries
OUTPUT="revcomp.fas"
PRIMER_F="${ORIGINAL_PRIMER_R}"
PRIMER_R="$(complement "${ORIGINAL_PRIMER_F}" | rev)"
trim_primers
## search reverse entries
OUTPUT="reverse.fas"
PRIMER_F="$(complement "${ORIGINAL_PRIMER_R}")"
PRIMER_R="$(rev <<< "${ORIGINAL_PRIMER_F}")"
trim_primers
## search complement entries
OUTPUT="complement.fas"
PRIMER_F="$(complement "${ORIGINAL_PRIMER_F}")"
PRIMER_R="$(rev <<< "${ORIGINAL_PRIMER_R}")"
trim_primers
The script allows 20% of errors in primer matching but forces a match greater or equal to 2/3rd of the primer length. That parameter should limit false positives.
The results are:
115 possible case of ‘reverse-complement’ sequences (a frequent mistake), 30 possible cases of ‘complement’ sequences (rare, but I’ve seen ‘complement’ 18S entries on GenBank), 1 possible case of ‘reverse’ sequence I join the corresponding fasta files: misoriented_entries.zip
3.1 Sequences in bad orientations
Total 146 sequences
- comp 26
- revcomp 114 + 189
- remove 6
3.2 Update
<- fasta_read(full_path("complement.fas")) %>%
comp mutate(to_do = "comp")
<- fasta_read(full_path("revcomp.fas"))%>%
revcomp mutate(to_do = "revcomp")
<- fasta_read(full_path("reverse.fas"))%>%
rev mutate(to_do = "rev")
<- bind_rows(list(comp, revcomp, rev)) %>%
update select(-sequence) %>%
::separate(seq_name, into = c("pr2_accession"), sep = ";") %>%
tidyrleft_join(pr2_main, by = "pr2_accession") %>%
left_join(pr2_seq, by = "pr2_accession") %>%
mutate( sequence_new = case_when(to_do == "revcomp" ~ seq_reverse(seq_complement(dna(sequence))),
== "comp" ~ seq_complement(dna(sequence)),
to_do == "rev" ~ seq_complement(dna(sequence))
to_do
),to_do = case_when(str_detect(pr2_accession, "JL712519|AF542044|AY223840|AY237160|KF747308|HQ660595") ~"remove",
TRUE ~ to_do) ,
start_new = case_when(to_do == "revcomp" ~ end,
TRUE ~ start),
end_new = case_when(to_do == "revcomp" ~ start,
TRUE ~ end),
pr2_accession_new = case_when(to_do == "revcomp" ~ str_c(genbank_accession, ".", start_new, ".", end_new, "_", label),
TRUE ~ pr2_accession),
removed_version = case_when(to_do == "remove" ~ pr2.env$version,
TRUE ~ removed_version),
edited_version = ifelse(to_do != "remove" , str_append(edited_version, pr2.env$version), edited_version),
edited_by = str_append(edited_by, pr2.env$editor),
edited_remark = case_when(to_do == "remove" ~ str_append(edited_remark, "bad sequence"),
== "revcomp" ~ str_append(edited_remark, "sequence reverse complemented"),
to_do == "comp" ~ str_append(edited_remark, "sequence complemented"),
to_do TRUE ~ edited_remark) ,
sequence_new = as.character(sequence_new),
sequence_length = str_length(sequence_new),
ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex)
)
$sequence_hash = purrr::map_chr(update$sequence_new,digest::sha1)
update
%>%count(to_do)
update
<- update %>%
pr2_main_update select(pr2_main_id, pr2_accession_new, start_new, end_new, removed_version, edited_version, edited_by, edited_remark)
<- update %>%
pr2_sequence_update filter(to_do != "remove") %>%
select(pr2_accession_new, sequence_new, sequence_length, sequence_hash, ambiguities)
export(list(pr2_main_update, pr2_sequence_update), full_path("pr2_update_orientation.xlsx"))
4 Round 2 - Fred
I though about an alternative approach that does not require primers, but is limited to finding reverse-complement entries:
PR2="pr2_version_4.13.0_18S_UTAX.fasta.gz"
# with vsearch 2.17 or more recent
vsearch \
--orient "${PR2}" \
--db "${PR2}" \
--quiet \
--tabbedout - | \
awk '$2 == "-"'
Forward oriented sequences: 178470 (99.82%) Reverse oriented sequences: 303 (0.17%) All oriented sequences: 178773 (99.99%) Not oriented sequences: 21 (0.01%) Total number of sequences: 178794 Here are the entries identified as likely reverse-complement when compared to the others: rev_comp.txt. The number of false-positives could be high though, so maybe there is no need to act on these sequences right now?
4.1 Read entries
<- import(full_path("revcomp_2.txt"), format = "tsv", sep = "\t", header = FALSE) %>%
revcomp_accessions separate(col = "V1", into = "pr2_accession", sep = ";", extra = "drop") %>%
left_join(select(pr2, pr2_main_id, pr2_accession, genbank_accession, label, kingdom:species,
%>%
sequence, start, end, edited_version, edited_by, edited_remark)) mutate(seq_name = pr2_accession) %>%
filter(!is.na(sequence))
fasta_write(revcomp_accessions, full_path("revcomp_2.fasta"), taxo_include = TRUE)
# export(revcomp_accessions, full_path("revcomp_2.xlsx"))
4.2 Examine BLAST against PR2
All sequences show more reverse fit that forward fit. One has to remove one set of sequences which were put together in random order….
<- import(full_path("revcomp_2.xlsx"), sheet = "BLAST") %>%
revcomp_blast filter(!str_detect(Name, "HG795")) %>% # Remove sequences that are in bad orientation
mutate(orientation = ifelse(Hit_start > Hit_end, "REV", "FWD")) %>%
count(Query, orientation) %>%
pivot_wider(names_from = orientation, values_from = n) %>%
mutate(FWD = str_replace_na(FWD, 0)) %>%
mutate(orientation = ifelse(REV > FWD, "REV", "FWD")) %>%
arrange(Query)
::glue("Minimum number of reverse sequences: {min(revcomp_blast$REV)}")
glue
%>% filter(orientation == "FWD") revcomp_blast
5 Update database
<- revcomp_accessions %>%
update mutate(sequence_new = as.character(seq_reverse(seq_complement(dna(sequence)))),
start_new = end,
end_new = start,
pr2_accession_new = str_c(genbank_accession, ".", start_new, ".", end_new, "_", label),
edited_version = str_append(edited_version, pr2.env$version),
edited_by = str_append(edited_by, pr2.env$editor),
edited_remark = str_append(edited_remark, "sequence reverse complemented"),
sequence_length = str_length(sequence_new),
ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex)
)
$sequence_hash = purrr::map_chr(update$sequence_new,digest::sha1)
update
<- update %>%
pr2_main_update select(pr2_main_id, pr2_accession = pr2_accession_new, star = start_new, end = end_new,
edited_version, edited_by, edited_remark)
<- update %>%
pr2_sequence_update select(pr2_accession = pr2_accession_new, sequence = sequence_new, sequence_hash,
sequence_length , ambiguities)
export(list(pr2_main_update, pr2_sequence_update), full_path("pr2_update_orientation_2.xlsx"))