PR2
Misaligned sequences Fred - issue # 26

1 Init

  source('../PR2_init.R', echo=FALSE)
  source('../PR2_read_google.R', echo=FALSE)

2 Set up the files

  dir_pr2_update <- "Misaligned Fred"
  
  pr2.env$editor <- "F. Mahé"

  full_path <- function(file_name){str_c(dir_pr2_update,"/", file_name)}

3 Round # 1 - Fred

While working with cutadapt (v3.4) to extract the SSU V4 region, I’ve noticed that some entries are not in the expected orientation (compared to the reference Saccharomyces cerevisiae). I wrote a script to search for all possible orientations:


## primers from Stoeck et al. 2010
ORIGINAL_PRIMER_F="CCAGCASCYGCGGTAATTCC"
ORIGINAL_PRIMER_R="ACTTTCGTTCTTGATYRA"


complement() {
    # complement a DNA/RNA IUPAC string
    [[ -z "${1}" ]] && { echo "error: empty string" ; exit 1 ; }
    local -r nucleotides="acgturykmbdhvswACGTURYKMBDHVSW"
    local -r complements="tgcaayrmkvhdbswTGCAAYRMKVHDBSW"
    tr "${nucleotides}" "${complements}" <<< "${1}"
}


trim_primers() {
    # trim and keep only entries with both primers
    OPTIONS="--minimum-length ${MIN_LENGTH} --discard-untrimmed --error-rate ${ERROR_RATE} --quiet"
    CUTADAPT="$(which cutadapt) ${OPTIONS}"
    MIN_LENGTH="32"
    ERROR_RATE="0.2"
    MIN_F=$(( ${#PRIMER_F} * 2 / 3 ))
    MIN_R=$(( ${#PRIMER_R} * 2 / 3 ))

    zcat "${SOURCE}.gz" | \
        dos2unix | \
        ${CUTADAPT} \
            --front "${PRIMER_F}" \
            --overlap "${MIN_F}" - | \
        ${CUTADAPT} \
            --adapter "${PRIMER_R}" \
            --overlap "${MIN_R}" - > "${OUTPUT}"
}

## download PR2 (UTAX version)
URL="https://github.com/pr2database/pr2database/releases/download"
VERSION="4.13.0"
SOURCE="pr2_version_${VERSION}_18S_UTAX.fasta"
[[ -e "${SOURCE}.gz" ]] || wget "${URL}/v${VERSION}/${SOURCE}.gz"


## search rev-comp entries
OUTPUT="revcomp.fas"
PRIMER_F="${ORIGINAL_PRIMER_R}"
PRIMER_R="$(complement "${ORIGINAL_PRIMER_F}" | rev)"
trim_primers


## search reverse entries
OUTPUT="reverse.fas"
PRIMER_F="$(complement "${ORIGINAL_PRIMER_R}")"
PRIMER_R="$(rev <<< "${ORIGINAL_PRIMER_F}")"
trim_primers


## search complement entries
OUTPUT="complement.fas"
PRIMER_F="$(complement "${ORIGINAL_PRIMER_F}")"
PRIMER_R="$(rev <<< "${ORIGINAL_PRIMER_R}")"
trim_primers

The script allows 20% of errors in primer matching but forces a match greater or equal to 2/3rd of the primer length. That parameter should limit false positives.

The results are:

115 possible case of ‘reverse-complement’ sequences (a frequent mistake), 30 possible cases of ‘complement’ sequences (rare, but I’ve seen ‘complement’ 18S entries on GenBank), 1 possible case of ‘reverse’ sequence I join the corresponding fasta files: misoriented_entries.zip

3.1 Sequences in bad orientations

Total 146 sequences

comp 26
revcomp 114 + 189
remove 6

3.2 Update

comp <- fasta_read(full_path("complement.fas")) %>% 
  mutate(to_do = "comp")
revcomp <- fasta_read(full_path("revcomp.fas"))%>% 
  mutate(to_do = "revcomp")
rev <- fasta_read(full_path("reverse.fas"))%>% 
  mutate(to_do = "rev")

update <- bind_rows(list(comp, revcomp, rev)) %>% 
  select(-sequence) %>% 
  tidyr::separate(seq_name, into = c("pr2_accession"), sep = ";") %>% 
  left_join(pr2_main, by = "pr2_accession") %>% 
  left_join(pr2_seq, by = "pr2_accession") %>% 
  mutate( sequence_new = case_when(to_do == "revcomp" ~ seq_reverse(seq_complement(dna(sequence))),
                                  to_do == "comp" ~ seq_complement(dna(sequence)),
                                  to_do == "rev" ~ seq_complement(dna(sequence))
                                  ),
          to_do = case_when(str_detect(pr2_accession, "JL712519|AF542044|AY223840|AY237160|KF747308|HQ660595") ~"remove", 
                         TRUE ~ to_do) ,
          start_new = case_when(to_do == "revcomp" ~ end,
                                TRUE ~ start),
          end_new = case_when(to_do == "revcomp" ~ start,
                              TRUE ~ end),
          pr2_accession_new = case_when(to_do == "revcomp" ~ str_c(genbank_accession, ".", start_new, ".", end_new, "_", label),
                                        TRUE ~ pr2_accession),
          removed_version = case_when(to_do == "remove" ~ pr2.env$version,
                                        TRUE ~ removed_version),
          edited_version = ifelse(to_do != "remove" , str_append(edited_version, pr2.env$version), edited_version),         
          edited_by = str_append(edited_by, pr2.env$editor), 
          edited_remark = case_when(to_do == "remove" ~ str_append(edited_remark, "bad sequence"),
                                    to_do == "revcomp" ~ str_append(edited_remark, "sequence reverse complemented"),
                                    to_do == "comp" ~ str_append(edited_remark, "sequence complemented"),
                                    TRUE ~ edited_remark) ,
          sequence_new = as.character(sequence_new),
          sequence_length = str_length(sequence_new),
          ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex)
          ) 
         
 update$sequence_hash = purrr::map_chr(update$sequence_new,digest::sha1)

update %>%count(to_do)

pr2_main_update <- update %>% 
  select(pr2_main_id, pr2_accession_new, start_new, end_new, removed_version, edited_version, edited_by, edited_remark)

pr2_sequence_update <- update %>% 
  filter(to_do != "remove") %>% 
  select(pr2_accession_new, sequence_new, sequence_length, sequence_hash, ambiguities)

export(list(pr2_main_update, pr2_sequence_update), full_path("pr2_update_orientation.xlsx"))

4 Round 2 - Fred

I though about an alternative approach that does not require primers, but is limited to finding reverse-complement entries:

  PR2="pr2_version_4.13.0_18S_UTAX.fasta.gz"  
  
  # with vsearch 2.17 or more recent
  vsearch \
      --orient "${PR2}" \
      --db "${PR2}" \
      --quiet \
      --tabbedout - | \
      awk '$2 == "-"'

Forward oriented sequences: 178470 (99.82%) Reverse oriented sequences: 303 (0.17%) All oriented sequences: 178773 (99.99%) Not oriented sequences: 21 (0.01%) Total number of sequences: 178794 Here are the entries identified as likely reverse-complement when compared to the others: rev_comp.txt. The number of false-positives could be high though, so maybe there is no need to act on these sequences right now?

4.1 Read entries

revcomp_accessions <- import(full_path("revcomp_2.txt"), format = "tsv", sep = "\t", header = FALSE) %>% 
  separate(col = "V1", into = "pr2_accession", sep = ";", extra = "drop") %>% 
  left_join(select(pr2, pr2_main_id, pr2_accession, genbank_accession, label, kingdom:species, 
                   sequence, start, end, edited_version, edited_by, edited_remark)) %>% 
  mutate(seq_name = pr2_accession) %>% 
  filter(!is.na(sequence))

fasta_write(revcomp_accessions, full_path("revcomp_2.fasta"), taxo_include = TRUE)

# export(revcomp_accessions,  full_path("revcomp_2.xlsx"))

4.2 Examine BLAST against PR2

All sequences show more reverse fit that forward fit. One has to remove one set of sequences which were put together in random order….

revcomp_blast <- import(full_path("revcomp_2.xlsx"), sheet = "BLAST") %>%
  filter(!str_detect(Name, "HG795")) %>%  # Remove sequences that are in bad orientation
  mutate(orientation = ifelse(Hit_start > Hit_end, "REV", "FWD")) %>% 
  count(Query, orientation) %>% 
  pivot_wider(names_from = orientation, values_from = n) %>% 
  mutate(FWD = str_replace_na(FWD, 0)) %>% 
  mutate(orientation = ifelse(REV > FWD, "REV", "FWD")) %>% 
  arrange(Query)

glue::glue("Minimum number of reverse sequences: {min(revcomp_blast$REV)}")
  
revcomp_blast %>% filter(orientation == "FWD")

5 Update database

update <-  revcomp_accessions %>% 
  mutate(sequence_new = as.character(seq_reverse(seq_complement(dna(sequence)))),
          start_new = end,
          end_new = start,
          pr2_accession_new = str_c(genbank_accession, ".", start_new, ".", end_new, "_", label),
          edited_version = str_append(edited_version, pr2.env$version),         
          edited_by = str_append(edited_by, pr2.env$editor), 
          edited_remark = str_append(edited_remark, "sequence reverse complemented"),
          sequence_length = str_length(sequence_new),
          ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex)
          ) 
         
 update$sequence_hash = purrr::map_chr(update$sequence_new,digest::sha1)


pr2_main_update <- update %>% 
  select(pr2_main_id, pr2_accession = pr2_accession_new, star = start_new, end = end_new, 
         edited_version, edited_by, edited_remark)

pr2_sequence_update <- update %>% 
  select(pr2_accession = pr2_accession_new, sequence = sequence_new, sequence_hash, 
         sequence_length , ambiguities)

export(list(pr2_main_update, pr2_sequence_update), full_path("pr2_update_orientation_2.xlsx"))

PR2 Misaligned sequences Fred - issue # 26