Counting k-mers via Suffix Array
Pre-requisite: Suffix Array. What are k-mers? The term k-mers typically refers to all the possible substrings of length k that are contained in a string. Counting all the k-mers in DNA/RNA sequencing reads is the preliminary step of many bioinformatics applications. What is a Suffix Array? A suffix array is a sorted array of all suffixes of a string. It is a data structure used, among others, in full text indices, data compression algorithms. More information can be found here. Problem: We are given a string str and an integer k. We have to find all pairs (substr, i) such that substr is a length – k substring of str that occurs exactly i times.
Steps involved in the approach:
Let’s take the word “banana$” as an example.
Step 1: Compute the suffix array of the given text.
6 $
5 a$
3 ana$
1 anana$
0 banana$
4 na$
2 nana$
Step 2: Iterate through the suffix array keeping “curr_count”.
- If the length of current suffix is less than k, then skip the iteration. That is, if k = 2, then iteration would be skipped when current suffix is $.
- If the current suffix begins with the same length – k substring as the previous suffix, then increment curr_count. For example, during fourth iteration current suffix “anana$” starts with same substring of length k “an” as previous suffix “ana$” started with. So, we will increment curr_count in this case.
- If condition 2 is not satisfied, then if length of previous suffix is equal to k, then that it is a valid pair and we will output it along with its current count, otherwise, we will skip that iteration.
curr_count Valid Pair
6 $ 1
5 a$ 1
3 ana$ 1 (a$, 1)
1 anana$ 1
0 banana$ 2 (an, 2)
4 na$ 1 (ba, 1)
2 nana$ 1 (na, 2)
Examples:
Input : banana$ // Input text
Output : (a$, 1) // k- mers
(an, 2)
(ba, 1)
(na, 2)
Input : w3wiki
Output : (ee, 2)
(ek, 2)
(fo, 1)
(ge, 2)
(ks, 2)
(or, 1)
(sf, 1)
The following is the C code for approach explained above:
// C++ program to solve K-mers counting problem
#include <bits/stdc++.h>
using namespace std;
// Structure to store data of a rotation
struct rotation {
int index;
char* suffix;
};
// Compares the rotations and
// sorts the rotations alphabetically
int cmpfunc(const void* x, const void* y)
{
struct rotation* rx = (struct rotation*)x;
struct rotation* ry = (struct rotation*)y;
return strcmp(rx->suffix, ry->suffix);
}
// Takes input_text and its length as arguments
// and returns the corresponding suffix array
char** computeSuffixArray(char* input_text, int len_text)
{
int i;
// Array of structures to store rotations
// and their indexes
struct rotation suff[len_text];
// Structure is needed to maintain old
// indexes of rotations after sorting them
for (i = 0; i < len_text; i++) {
suff[i].index = i;
suff[i].suffix = (input_text + i);
}
// Sorts rotations using comparison function
// defined above
qsort(suff, len_text, sizeof(struct rotation), cmpfunc);
// Stores the suffixes of sorted rotations
char** suffix_arr
= (char**)malloc(len_text * sizeof(char*));
for (i = 0; i < len_text; i++) {
suffix_arr[i]
= (char*)malloc((len_text + 1) * sizeof(char));
strcpy(suffix_arr[i], suff[i].suffix);
}
// Returns the computed suffix array
return suffix_arr;
}
// Takes suffix array, its size and valid length as
// arguments and outputs the valid pairs of k - mers
void findValidPairs(char** suffix_arr, int n, int k)
{
int curr_count = 1, i;
char* prev_suff = (char*)malloc(n * sizeof(char));
// Iterates over the suffix array,
// keeping a current count
for (i = 0; i < n; i++) {
// Skipping the current suffix
// if it has length < valid length
if (strlen(suffix_arr[i]) < k) {
if (i != 0 && strlen(prev_suff) == k) {
cout << "(" << prev_suff << ", "
<< curr_count << ")" << endl;
curr_count = 1;
}
strcpy(prev_suff, suffix_arr[i]);
continue;
}
// Incrementing the curr_count if first
// k chars of prev_suff and current suffix
// are same
if (!(memcmp(prev_suff, suffix_arr[i], k))) {
curr_count++;
}
else {
// Pair is valid when i!=0 (as there is
// no prev_suff for i = 0) and when strlen
// of prev_suff is k
if (i != 0 && strlen(prev_suff) == k) {
cout << "(" << prev_suff << ", "
<< curr_count << ")" << endl;
curr_count = 1;
memcpy(prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0';
}
else {
memcpy(prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0';
continue;
}
}
// Modifying prev_suff[i] to current suffix
memcpy(prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0';
}
// Printing the last valid pair
cout << "(" << prev_suff << ", " << curr_count << ")"
<< endl;
}
// Driver program to test functions above
int main()
{
char input_text[] = "w3wiki";
int k = 2;
int len_text = strlen(input_text);
// Computes the suffix array of our text
cout << "Input Text: " << input_text << endl;
char** suffix_arr
= computeSuffixArray(input_text, len_text);
// Finds and outputs all valid pairs
cout << "k-mers: " << endl;
findValidPairs(suffix_arr, len_text, k);
return 0;
}
// C program to solve K-mers counting problem
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Structure to store data of a rotation
struct rotation {
int index;
char* suffix;
};
// Compares the rotations and
// sorts the rotations alphabetically
int cmpfunc(const void* x, const void* y)
{
struct rotation* rx = (struct rotation*)x;
struct rotation* ry = (struct rotation*)y;
return strcmp(rx->suffix, ry->suffix);
}
// Takes input_text and its length as arguments
// and returns the corresponding suffix array
char** computeSuffixArray(char* input_text, int len_text)
{
int i;
// Array of structures to store rotations
// and their indexes
struct rotation suff[len_text];
// Structure is needed to maintain old
// indexes of rotations after sorting them
for (i = 0; i < len_text; i++) {
suff[i].index = i;
suff[i].suffix = (input_text + i);
}
// Sorts rotations using comparison function
// defined above
qsort(suff, len_text, sizeof(struct rotation), cmpfunc);
// Stores the suffixes of sorted rotations
char** suffix_arr
= (char**)malloc(len_text * sizeof(char*));
for (i = 0; i < len_text; i++) {
suffix_arr[i]
= (char*)malloc((len_text + 1) * sizeof(char));
strcpy(suffix_arr[i], suff[i].suffix);
}
// Returns the computed suffix array
return suffix_arr;
}
// Takes suffix array, its size and valid length as
// arguments and outputs the valid pairs of k - mers
void findValidPairs(char** suffix_arr, int n, int k)
{
int curr_count = 1, i;
char* prev_suff = (char*)malloc(n * sizeof(char));
// Iterates over the suffix array,
// keeping a current count
for (i = 0; i < n; i++) {
// Skipping the current suffix
// if it has length < valid length
if (strlen(suffix_arr[i]) < k) {
if (i != 0 && strlen(prev_suff) == k) {
printf("(%s, %d)\n", prev_suff, curr_count);
curr_count = 1;
}
strcpy(prev_suff, suffix_arr[i]);
continue;
}
// Incrementing the curr_count if first
// k chars of prev_suff and current suffix
// are same
if (!(memcmp(prev_suff, suffix_arr[i], k))) {
curr_count++;
}
else {
// Pair is valid when i!=0 (as there is
// no prev_suff for i = 0) and when strlen
// of prev_suff is k
if (i != 0 && strlen(prev_suff) == k) {
printf("(%s, %d)\n", prev_suff, curr_count);
curr_count = 1;
memcpy(prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0';
}
else {
memcpy(prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0';
continue;
}
}
// Modifying prev_suff[i] to current suffix
memcpy(prev_suff, suffix_arr[i], k);
prev_suff[k] = '\0';
}
// Printing the last valid pair
printf("(%s, %d)\n", prev_suff, curr_count);
}
// Driver program to test functions above
int main()
{
char input_text[] = "w3wiki";
int k = 2;
int len_text = strlen(input_text);
// Computes the suffix array of our text
printf("Input Text: %s\n", input_text);
char** suffix_arr
= computeSuffixArray(input_text, len_text);
// Finds and outputs all valid pairs
printf("k-mers: \n");
findValidPairs(suffix_arr, len_text, k);
return 0;
}
import java.util.Arrays;
// Java program to solve K-mers counting problem
public class Main {
// Structure to store data of a rotation
static class Rotation {
int index;
String suffix;
}
// Takes input_text and its length as arguments
// and returns the corresponding suffix array
static Rotation[] computeSuffixArray(String input_text,
int len_text)
{
Rotation[] suff = new Rotation[len_text];
// Structure is needed to maintain old
// indexes of rotations after sorting them
for (int i = 0; i < len_text; i++) {
suff[i] = new Rotation();
suff[i].index = i;
suff[i].suffix = input_text.substring(i);
}
// Sorts rotations using comparison function
// defined above
Arrays.sort(suff,
(a, b) -> a.suffix.compareTo(b.suffix));
// Returns the computed suffix array
return suff;
}
// Takes suffix array, its size and valid length as
// arguments and outputs the valid pairs of k - mers
static void findValidPairs(Rotation[] suffix_arr, int n,
int k)
{
int curr_count = 1;
String prev_suff = "";
for (int i = 0; i < n; i++) {
if (suffix_arr[i].suffix.length() < k) {
if (i != 0 && prev_suff.length() == k) {
System.out.println("(" + prev_suff
+ ", " + curr_count
+ ")");
curr_count = 1;
}
prev_suff = suffix_arr[i].suffix;
continue;
}
if (prev_suff.length() >= k
&& suffix_arr[i].suffix.length() >= k
&& prev_suff.substring(0, k).equals(
suffix_arr[i].suffix.substring(0, k))) {
curr_count++;
}
else {
if (i != 0 && prev_suff.length() == k) {
System.out.println("(" + prev_suff
+ ", " + curr_count
+ ")");
curr_count = 1;
prev_suff
= suffix_arr[i].suffix.length() >= k
? suffix_arr[i]
.suffix.substring(0, k)
: suffix_arr[i].suffix;
}
else {
prev_suff
= suffix_arr[i].suffix.length() >= k
? suffix_arr[i]
.suffix.substring(0, k)
: suffix_arr[i].suffix;
continue;
}
}
prev_suff
= suffix_arr[i].suffix.length() >= k
? suffix_arr[i].suffix.substring(0, k)
: suffix_arr[i].suffix;
}
System.out.println("(" + prev_suff + ", "
+ curr_count + ")");
}
// Driver program to test functions above
public static void main(String[] args)
{
String input_text = "w3wiki";
int k = 2;
int len_text = input_text.length();
// Computes the suffix array of our text
System.out.println("Input Text: " + input_text);
Rotation[] suffix_arr
= computeSuffixArray(input_text, len_text);
// Finds and outputs all valid pairs
System.out.println("k-mers: ");
findValidPairs(suffix_arr, len_text, k);
}
}
# Python program to solve K-mers counting problem
# Structure to store data of a rotation
class Rotation:
def __init__(self, index, suffix):
self.index = index
self.suffix = suffix
# Takes input_text and its length as arguments
# and returns the corresponding suffix array
def compute_suffix_array(input_text, len_text):
# List is needed to maintain old
# indexes of rotations after sorting them
suff = [Rotation(i, input_text[i:]) for i in range(len_text)]
# Sorts rotations using comparison function
# defined above
suff.sort(key=lambda x: x.suffix)
# Returns the computed suffix array
return suff
# Takes suffix array, its size and valid length as
# arguments and outputs the valid pairs of k - mers
def find_valid_pairs(suffix_arr, n, k):
curr_count = 1
prev_suff = ""
for i in range(n):
if len(suffix_arr[i].suffix) < k:
if i != 0 and len(prev_suff) == k:
print(f"({prev_suff}, {curr_count})")
curr_count = 1
prev_suff = suffix_arr[i].suffix
continue
if len(prev_suff) >= k and len(suffix_arr[i].suffix) >= k and prev_suff[:k] == suffix_arr[i].suffix[:k]:
curr_count += 1
else:
if i != 0 and len(prev_suff) == k:
print(f"({prev_suff}, {curr_count})")
curr_count = 1
prev_suff = suffix_arr[i].suffix[:k] if len(suffix_arr[i].suffix) >= k else suffix_arr[i].suffix
else:
prev_suff = suffix_arr[i].suffix[:k] if len(suffix_arr[i].suffix) >= k else suffix_arr[i].suffix
continue
prev_suff = suffix_arr[i].suffix[:k] if len(suffix_arr[i].suffix) >= k else suffix_arr[i].suffix
print(f"({prev_suff}, {curr_count})")
# Driver program to test functions above
def main():
input_text = "w3wiki"
k = 2
len_text = len(input_text)
# Computes the suffix array of our text
print(f"Input Text: {input_text}")
suffix_arr = compute_suffix_array(input_text, len_text)
# Finds and outputs all valid pairs
print("k-mers: ")
find_valid_pairs(suffix_arr, len_text, k)
if __name__ == "__main__":
main()
// Class to represent a rotation
class Rotation {
constructor(index, suffix) {
this.index = index;
this.suffix = suffix;
}
}
// Function to compute the suffix array
function computeSuffixArray(inputText) {
const lenText = inputText.length;
const suff = [];
// Create rotations and store them in suff array
for (let i = 0; i < lenText; i++) {
suff.push(new Rotation(i, inputText.substring(i)));
}
// Sort rotations using comparison function
suff.sort((a, b) => a.suffix.localeCompare(b.suffix));
return suff;
}
// Function to find and output valid pairs of k-mers
function findValidPairs(suffixArr, k) {
let currCount = 1;
let prevSuffix = "";
for (let i = 0; i < suffixArr.length; i++) {
const suffix = suffixArr[i].suffix;
if (suffix.length < k) {
if (i !== 0 && prevSuffix.length === k) {
console.log(`(${prevSuffix}, ${currCount})`);
currCount = 1;
}
prevSuffix = suffix;
continue;
}
if (prevSuffix.length >= k && suffix.length >= k &&
prevSuffix.substring(0, k) === suffix.substring(0, k)) {
currCount++;
} else {
if (i !== 0 && prevSuffix.length === k) {
console.log(`(${prevSuffix}, ${currCount})`);
currCount = 1;
prevSuffix = suffix.length >= k ? suffix.substring(0, k) : suffix;
} else {
prevSuffix = suffix.length >= k ? suffix.substring(0, k) : suffix;
continue;
}
}
prevSuffix = suffix.length >= k ? suffix.substring(0, k) : suffix;
}
console.log(`(${prevSuffix}, ${currCount})`);
}
// Main function to test the above functions
function main() {
const inputText = "w3wiki";
const k = 2;
console.log("Input Text:", inputText);
// Computes the suffix array of the input text
const suffixArr = computeSuffixArray(inputText);
// Finds and outputs all valid pairs of k-mers
console.log("k-mers:");
findValidPairs(suffixArr, k);
}
// Call the main function
main();
Output:
Input Text: banana$
k-mers:
(a$, 1)
(an, 2)
(ba, 1)
(na, 2)
Time Complexity: O(s*len_text*log(len_text)), assuming s is the length of the longest suffix.
Contact Us