def kmer2int(kmer, numerals=None):
    """Map a DNA k-mer to an integer in base 4 (A=0, C=1, G=2, T=3).

    The last character is the least-significant digit, so e.g.
    kmer2int("CAT") == 1*16 + 0*4 + 3*1 == 19.

    numerals: optional mapping from character to digit. Defaults to the
    standard A/C/G/T alphabet; a None sentinel is used instead of a dict
    literal so no mutable default is shared between calls.
    """
    if numerals is None:
        numerals = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # reverse numerals so that the 1st corresponds to the 0th power etc
    return sum(numerals[c] * 4**k for k, c in enumerate(reversed(kmer)))
kmer2int("CAT")
19
2022-10-01
> source: https://www.ecseq.com
Aligners differ along 3 main dimensions.
> source: Alser et al. Genome Biology (2021) 22:249. Figure 2
The 2 main technologies for index construction are:
Feature | Hashing | BWT-FM |
---|---|---|
Implementation | Easy | Hard |
Key matching | Exact | Exact and Inexact |
Index Size | Large | Compressed |
Index time | Fast | Slow |
Query Time | Fast | Slow |
Seed Length | Fixed | Variable |
source: Alser et al. Genome Biology (2021) 22:249. Table 2
source: Wikipedia Hash Table
SimpleKmerHash
class SimpleKmerHash:
    """Direct-addressing table for fixed-length DNA k-mers.

    Each k-mer over {A, C, G, T} is read as a base-4 number and used to
    index a dense array of size 4**k. A slot holding None is empty.
    """

    def __init__(self, k):
        self.k = k
        self._alphabet = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        self._array = [None] * 4**k  # one slot per possible k-mer

    def kmer2int(self, kmer):
        """Base-4 encoding; the last character is the least-significant digit."""
        assert len(kmer) == self.k
        return sum(self._alphabet[c] * 4**power
                   for power, c in enumerate(reversed(kmer)))

    def __getitem__(self, kmer):
        return self._array[self.kmer2int(kmer)]

    def __setitem__(self, kmer, value):
        self._array[self.kmer2int(kmer)] = value

    def __contains__(self, kmer):
        return self._array[self.kmer2int(kmer)] is not None
p, t = "CAT", "TTGTGTGCATGTTGTTTCATCATTTAGAGATACATTGCGCTGCATCATGGTAG"
#             01234567890123456789012345678901234567890123456789012
# Occurrences: * * * * * *
hashTable = SimpleKmerHash(k=3)
# index every 3-mer of the text by its starting position
for pos, word in enumerate(iter_words(t, k=3)):
    if word not in hashTable:
        hashTable[word] = []
    hashTable[word].append(pos)
hashTable[p]
[7, 17, 20, 32, 42, 45]
Benefits:
For a single seed, with probability of sequencing error \(p_e\):
For multiple seeds \(|S| \gt 1\) with \(k' \gt k\):
\[\text{FDR} = \frac{|S|}{4^{k'}}\]
Representative set of k-mers that can be used to locate other \(k\)-mers with which they have a significant overlap.
For every \(w\) consecutive (shifted by 1) \(k\)-mers, pick the “smallest” of them according to some fixed ordering.
Subread introduced seed-and-vote.
Some aligners aim to guarantee perfect matching under some assumptions about the distribution of errors. They make use of the pigeonhole principle or the q-gram lemma.
An alignment of length \(L\) with \(e\) errors has at least \(L + 1 - q(e+1)\) q-seeds with no errors.
class SmarterIndex:
    """Index over an arbitrary array via sorting + binary search.

    Stores (value, original_index) pairs sorted by value, so all positions
    of a given value form one contiguous run located by two bisections.
    """

    def __init__(self, array):
        # tuples sort by their 0th element first (ties broken by index)
        self._array = sorted((value, pos) for pos, value in enumerate(array))

    def _bisect_left(self, x, low=0, high=None):
        """Index of the first entry whose value is >= x."""
        high = len(self._array) if high is None else high
        while low < high:
            mid = (low + high) // 2
            if x <= self._array[mid][0]:
                high = mid
            else:
                low = mid + 1
        return low

    def _bisect_right(self, x, low=0, high=None):
        """Index of the first entry whose value is > x."""
        high = len(self._array) if high is None else high
        while low < high:
            mid = (low + high) // 2
            if self._array[mid][0] <= x:
                low = mid + 1
            else:
                high = mid
        return low

    def __getitem__(self, x):
        """All original positions of x (ascending); [] if absent."""
        start = self._bisect_left(x)
        if start >= len(self._array) or self._array[start][0] != x:
            return []
        stop = self._bisect_right(x, start)
        return [pos for _, pos in self._array[start:stop]]

    def __contains__(self, x):
        start = self._bisect_left(x)
        return start < len(self._array) and self._array[start][0] == x

    @property
    def array(self):
        """Yields the original array (entries re-sorted by original index)."""
        by_position = sorted(self._array, key=lambda pair: pair[1])
        yield from (value for value, _ in by_position)
\(k\)-mers are truncated suffixes; we can extend them to get the full suffix:
abaaba
------
aba
baa
aab
aba
abaaba
------
abaaba
baaba
aaba
aba
ba
a
sa | suffix
-- | -------
6 | $
5 | a$
2 | aaba$
3 | aba$
0 | abaaba$
4 | ba$
1 | baaba$
SAIndex
Every pattern that exists in text
will appear as the beginning of a suffix. So we can use the suffix array as index!
class SAIndex:
    """Substring index backed by a suffix array.

    Every pattern occurring in the text is a prefix of some suffix, so its
    occurrences form a contiguous band of rows in the sorted suffix array,
    found with two binary searches.
    """

    def __init__(self, text):
        self.text = text
        self.sa = build_suffix_array(text)
        self.n = len(self.sa)

    def _bisect_left(self, p, low=0, high=None):
        """First suffix-array row whose suffix is >= p (compared on len(p) chars)."""
        if high is None:
            high = self.n
        while low < high:
            mid = (low + high) // 2
            i = self.sa[mid]
            if self.text[i:i+len(p)] < p:
                low = mid + 1
            else:
                high = mid
        return low

    def _bisect_right(self, p, low=0, high=None):
        """First suffix-array row whose suffix is > p (compared on len(p) chars)."""
        if high is None:
            # BUGFIX: was self.n - 1, which silently dropped a match that
            # ended at the last suffix-array row.
            high = self.n
        while low < high:
            mid = (low + high) // 2
            i = self.sa[mid]
            if p < self.text[i:i+len(p)]:
                high = mid
            else:
                low = mid + 1
        return low

    def match(self, p, i):
        """Does p occur in the text starting at position i?"""
        if i < 0 or i > len(self.text) - len(p):
            return False
        return self.text[i:i+len(p)] == p

    def __getitem__(self, p):
        """All occurrence positions of p; [] if absent."""
        i = self._bisect_left(p)
        if i >= self.n or not self.match(p, self.sa[i]):
            return []
        j = self._bisect_right(p, i)
        return self.sa[i:j]  # sort if you want them in order of appearance

    def __contains__(self, p):
        i = self._bisect_left(p)
        # BUGFIX: guard i < n; self.sa[i] raised IndexError when the
        # pattern sorted after every suffix in the text.
        return i < self.n and self.match(p, self.sa[i])
http://www.cs.jhu.edu/~langmea/resources/lecture_notes/220_bwt_intro_boost_pub.pdf
https://qr.ae/pvaJcA
text[i:]
)$
It renders all the rotations unique (breaks symmetries)
Key BWT properties:
i
(text[:i]
)bwt[i:i+len(p)] == p
) since the text is permuted. Given a query, we work in reverse (because the BWT character precedes the suffix):
rotation[i] = text[i:] + text[:i]
BWM[i, 0] = text[i]
BWM[i, -1] = text[i-1] = BWT[i]
\[\text{LF}(x_i) = F(x) + i \]
where:
sa
to map to genome.

sa and effective ranks can take up a lot of space. We can use check-points and LF steps.

class LFCheckpoints:
@staticmethod
def effective_ranks(bwt):
ckps = {}
N = len(bwt)
# record appearances
for i, c in enumerate(bwt):
if c not in ckps:
ckps[c] = [0] * N
ckps[c][i] = 1
# get count up to position i
for c, occ in ckps.items():
for i in range(1, N):
occ[i] += occ[i-1] # cumsum
ckps[c] = occ
return ckps
@staticmethod
def ranks2rle(ranks):
first, row = {}, 0
for c, ckp in sorted(ranks.items()):
first[c] = row
row += ckp[-1] # add total tally
return first
def __init__(self, bwt):
self.N = len(bwt)
self.rank = self.effective_ranks(bwt)
self.first = self.ranks2rle(self.rank)
def __call__(self, c, i):
if i < 0:
return self.first[c]
i = min(i, self.N - 1)
return self.first[c] + self.rank[c][i]
class FMIndex:
    """FM-index: substring search over the Burrows-Wheeler transform.

    Matching walks the pattern right-to-left, maintaining the band of BWM
    rows whose rotations start with the pattern suffix seen so far.
    """

    @staticmethod
    def BWT(T):
        """Return (bwt, suffix array) of T; T must already end with '$'."""
        # sa = sort positions i by the suffix they define
        sa = sorted(range(len(T)), key=lambda i: T[i:])
        # character preceding each suffix; T[-1] wraps around to the sentinel
        bwt = ''.join(T[i - 1] for i in sa)
        return bwt, sa

    def __init__(self, T):
        # BUGFIX: T[-1] raised IndexError on empty input; endswith is safe
        # and "" becomes just the sentinel "$".
        if not T.endswith('$'):
            T += '$'
        self.bwt, self.sa = self.BWT(T)
        self.LF = LFCheckpoints(self.bwt)

    def _match(self, p):
        """Band [low, high) of BWM rows prefixed by p (empty if p is absent)."""
        low, high = 0, len(self.bwt)
        for c in reversed(p):
            # BUGFIX: a pattern character absent from the text used to
            # raise KeyError inside LF; report "no match" instead.
            if c not in self.LF.first:
                return 0, 0
            low = self.LF(c, low - 1)    # counts rows strictly before low (so 1st low has index 0)
            high = self.LF(c, high) - 1  # counts up to high; -1 turns it into an index
            if low > high:               # band collapsed: no occurrence
                break
        return low, high + 1  # +1 to work with slice notation

    def __contains__(self, p):
        low, high = self._match(p)
        return low < high

    def __getitem__(self, p):
        """Occurrence positions of p in the text (not sorted by position)."""
        low, high = self._match(p)
        # range() is already empty when low >= high, so no extra guard needed
        return [self.sa[i] for i in range(low, high)]
BWAIndex/
:
genome.amb
: ambiguous? coordinates of non-ATCG characters

genome.ann
: annotation? info about contigs

genome.bwt

genome.pac
: compressed version of genome (2 bits/base)

genome.sa
Also usage:
A Maximal Exact Match (MEM) is a subsequence of the read that is present in the reference genome (exact) and that cannot be extended further (maximal), either because the read ends or because the extended subsequence is not in the genome.
They are an example of variable-sized seeds and are calculated via the FM index.
A read can be aligned to multiple positions we need to quantify our confidence in each.
\[P_M(u \mid G, r) \propto P(r|G_u^{u+l}) P(u, l)\]
Usually (but not always!) the best alignment is reported: \(\hat{u} = \arg\max_u P_M(u \mid G, r)\)
MAPQ is a quality score: \(Q_M = -10 \log_{10} P(\hat{u} \neq u_{\star})\)
For long (> 1kbp) reads some of these assumptions do not hold anymore like:
These error profiles necessitate the use of a local alignment (SW) since we cannot bound the possible errors a-priori and perform a simple backtrack.
The reference genome is just 1 individual. Human genomes differ by 0.1% ⇒ systematic mismatches
Variance aware references:
N
at known polymorphic position.