Counting k-mers via Suffix Array

Pre-requisite: Suffix Array. What are k-mers? The term k-mers typically refers to all the possible substrings of length k that are contained in a string. Counting all the k-mers in DNA/RNA sequencing reads is the preliminary step of many bioinformatics applications. What is a Suffix Array? A suffix array is a sorted array of all suffixes of a string. It is a data structure used, among others, in full text indices, data compression algorithms. More information can be found here. Problem: We are given a string str and an integer k. We have to find all pairs (substr, i) such that substr is a length – k substring of str that occurs exactly i times. 

Steps involved in the approach: 

Let’s take the word “banana$” as an example. 

Step 1: Compute the suffix array of the given text.

          6     $   
5 a$
3 ana$
1 anana$
0 banana$
4 na$
2 nana$

Step 2: Iterate through the suffix array keeping “curr_count”

  1. If the length of current suffix is less than k, then skip the iteration. That is, if k = 2, then iteration would be skipped when current suffix is $
  2. If the current suffix begins with the same length – k substring as the previous suffix, then increment curr_count. For example, during fourth iteration current suffix “anana$” starts with same substring of length k “an” as previous suffix “ana$” started with. So, we will increment curr_count in this case. 
  3. If condition 2 is not satisfied, then if length of previous suffix is equal to k, then that it is a valid pair and we will output it along with its current count, otherwise, we will skip that iteration.
                 curr_count  Valid Pair
6 $ 1
5 a$ 1
3 ana$ 1 (a$, 1)
1 anana$ 1
0 banana$ 2 (an, 2)
4 na$ 1 (ba, 1)
2 nana$ 1 (na, 2)

Examples: 

Input : banana$ // Input text
Output : (a$, 1) // k- mers
(an, 2)
(ba, 1)
(na, 2)

Input : w3wiki
Output : (ee, 2)
(ek, 2)
(fo, 1)
(ge, 2)
(ks, 2)
(or, 1)
(sf, 1)

The following is the C code for approach explained above: 

C++
// C++ program to solve K-mers counting problem
#include <bits/stdc++.h>
using namespace std;

// Structure to store data of a rotation
struct rotation {
    int index;
    char* suffix;
};

// Compares the rotations and
// sorts the rotations alphabetically
int cmpfunc(const void* x, const void* y)
{
    struct rotation* rx = (struct rotation*)x;
    struct rotation* ry = (struct rotation*)y;
    return strcmp(rx->suffix, ry->suffix);
}

// Takes input_text and its length as arguments
// and returns the corresponding suffix array
char** computeSuffixArray(char* input_text, int len_text)
{
    int i;

    // Array of structures to store rotations
    // and their indexes
    struct rotation suff[len_text];

    // Structure is needed to maintain old
    // indexes of rotations after sorting them
    for (i = 0; i < len_text; i++) {
        suff[i].index = i;
        suff[i].suffix = (input_text + i);
    }

    // Sorts rotations using comparison function
    // defined above
    qsort(suff, len_text, sizeof(struct rotation), cmpfunc);

    // Stores the suffixes of sorted rotations
    char** suffix_arr
        = (char**)malloc(len_text * sizeof(char*));

    for (i = 0; i < len_text; i++) {
        suffix_arr[i]
            = (char*)malloc((len_text + 1) * sizeof(char));
        strcpy(suffix_arr[i], suff[i].suffix);
    }

    // Returns the computed suffix array
    return suffix_arr;
}

// Takes suffix array, its size and valid length as
// arguments and outputs the valid pairs of k - mers
void findValidPairs(char** suffix_arr, int n, int k)
{
    int curr_count = 1, i;
    char* prev_suff = (char*)malloc(n * sizeof(char));

    // Iterates over the suffix array,
    // keeping a current count
    for (i = 0; i < n; i++) {

        // Skipping the current suffix
        // if it has length < valid length
        if (strlen(suffix_arr[i]) < k) {

            if (i != 0 && strlen(prev_suff) == k) {
                cout << "(" << prev_suff << ", "
                     << curr_count << ")" << endl;
                curr_count = 1;
            }

            strcpy(prev_suff, suffix_arr[i]);
            continue;
        }

        // Incrementing the curr_count if first
        // k chars of prev_suff and current suffix
        // are same
        if (!(memcmp(prev_suff, suffix_arr[i], k))) {
            curr_count++;
        }
        else {

            // Pair is valid when i!=0 (as there is
            // no prev_suff for i = 0) and when strlen
            // of prev_suff is k
            if (i != 0 && strlen(prev_suff) == k) {
                cout << "(" << prev_suff << ", "
                     << curr_count << ")" << endl;
                curr_count = 1;
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
            }
            else {
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
                continue;
            }
        }

        // Modifying prev_suff[i] to current suffix
        memcpy(prev_suff, suffix_arr[i], k);
        prev_suff[k] = '\0';
    }

    // Printing the last valid pair
    cout << "(" << prev_suff << ", " << curr_count << ")"
         << endl;
}

// Driver program to test functions above
int main()
{
    char input_text[] = "w3wiki";
    int k = 2;
    int len_text = strlen(input_text);

    // Computes the suffix array of our text
    cout << "Input Text: " << input_text << endl;
    char** suffix_arr
        = computeSuffixArray(input_text, len_text);

    // Finds and outputs all valid pairs
    cout << "k-mers: " << endl;
    findValidPairs(suffix_arr, len_text, k);

    return 0;
}
C
// C program to solve K-mers counting problem
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Structure to store data of a rotation
struct rotation {
    int index;
    char* suffix;
};

// Compares the rotations and
// sorts the rotations alphabetically
int cmpfunc(const void* x, const void* y)
{
    struct rotation* rx = (struct rotation*)x;
    struct rotation* ry = (struct rotation*)y;
    return strcmp(rx->suffix, ry->suffix);
}

// Takes input_text and its length as arguments
// and returns the corresponding suffix array
char** computeSuffixArray(char* input_text, int len_text)
{
    int i;

    // Array of structures to store rotations
    // and their indexes
    struct rotation suff[len_text];

    // Structure is needed to maintain old
    // indexes of rotations after sorting them
    for (i = 0; i < len_text; i++) {
        suff[i].index = i;
        suff[i].suffix = (input_text + i);
    }

    // Sorts rotations using comparison function
    // defined above
    qsort(suff, len_text, sizeof(struct rotation), cmpfunc);

    // Stores the suffixes of sorted rotations
    char** suffix_arr
        = (char**)malloc(len_text * sizeof(char*));

    for (i = 0; i < len_text; i++) {
        suffix_arr[i]
            = (char*)malloc((len_text + 1) * sizeof(char));
        strcpy(suffix_arr[i], suff[i].suffix);
    }

    // Returns the computed suffix array
    return suffix_arr;
}

// Takes suffix array, its size and valid length as
// arguments and outputs the valid pairs of k - mers
void findValidPairs(char** suffix_arr, int n, int k)
{
    int curr_count = 1, i;
    char* prev_suff = (char*)malloc(n * sizeof(char));

    // Iterates over the suffix array,
    // keeping a current count
    for (i = 0; i < n; i++) {

        // Skipping the current suffix
        // if it has length < valid length
        if (strlen(suffix_arr[i]) < k) {

            if (i != 0 && strlen(prev_suff) == k) {
                printf("(%s, %d)\n", prev_suff, curr_count);
                curr_count = 1;
            }

            strcpy(prev_suff, suffix_arr[i]);
            continue;
        }

        // Incrementing the curr_count if first
        // k chars of prev_suff and current suffix
        // are same
        if (!(memcmp(prev_suff, suffix_arr[i], k))) {
            curr_count++;
        }
        else {

            // Pair is valid when i!=0 (as there is
            // no prev_suff for i = 0) and when strlen
            // of prev_suff is k
            if (i != 0 && strlen(prev_suff) == k) {
                printf("(%s, %d)\n", prev_suff, curr_count);
                curr_count = 1;
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
            }
            else {
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
                continue;
            }
        }

        // Modifying prev_suff[i] to current suffix
        memcpy(prev_suff, suffix_arr[i], k);
        prev_suff[k] = '\0';
    }

    // Printing the last valid pair
    printf("(%s, %d)\n", prev_suff, curr_count);
}

// Driver program to test functions above
int main()
{
    char input_text[] = "w3wiki";
    int k = 2;
    int len_text = strlen(input_text);

    // Computes the suffix array of our text
    printf("Input Text: %s\n", input_text);
    char** suffix_arr
        = computeSuffixArray(input_text, len_text);

    // Finds and outputs all valid pairs
    printf("k-mers: \n");
    findValidPairs(suffix_arr, len_text, k);

    return 0;
}
Java
import java.util.Arrays;

// Java program to solve K-mers counting problem
public class Main {

    // Structure to store data of a rotation
    static class Rotation {
        int index;
        String suffix;
    }

    // Takes input_text and its length as arguments
    // and returns the corresponding suffix array
    static Rotation[] computeSuffixArray(String input_text,
                                         int len_text)
    {
        Rotation[] suff = new Rotation[len_text];

        // Structure is needed to maintain old
        // indexes of rotations after sorting them
        for (int i = 0; i < len_text; i++) {
            suff[i] = new Rotation();
            suff[i].index = i;
            suff[i].suffix = input_text.substring(i);
        }

        // Sorts rotations using comparison function
        // defined above
        Arrays.sort(suff,
                    (a, b) -> a.suffix.compareTo(b.suffix));

        // Returns the computed suffix array
        return suff;
    }

    // Takes suffix array, its size and valid length as
    // arguments and outputs the valid pairs of k - mers
    static void findValidPairs(Rotation[] suffix_arr, int n,
                               int k)
    {
        int curr_count = 1;
        String prev_suff = "";

        for (int i = 0; i < n; i++) {
            if (suffix_arr[i].suffix.length() < k) {
                if (i != 0 && prev_suff.length() == k) {
                    System.out.println("(" + prev_suff
                                       + ", " + curr_count
                                       + ")");
                    curr_count = 1;
                }
                prev_suff = suffix_arr[i].suffix;
                continue;
            }

            if (prev_suff.length() >= k
                && suffix_arr[i].suffix.length() >= k
                && prev_suff.substring(0, k).equals(
                    suffix_arr[i].suffix.substring(0, k))) {
                curr_count++;
            }
            else {
                if (i != 0 && prev_suff.length() == k) {
                    System.out.println("(" + prev_suff
                                       + ", " + curr_count
                                       + ")");
                    curr_count = 1;
                    prev_suff
                        = suffix_arr[i].suffix.length() >= k
                              ? suffix_arr[i]
                                    .suffix.substring(0, k)
                              : suffix_arr[i].suffix;
                }
                else {
                    prev_suff
                        = suffix_arr[i].suffix.length() >= k
                              ? suffix_arr[i]
                                    .suffix.substring(0, k)
                              : suffix_arr[i].suffix;
                    continue;
                }
            }
            prev_suff
                = suffix_arr[i].suffix.length() >= k
                      ? suffix_arr[i].suffix.substring(0, k)
                      : suffix_arr[i].suffix;
        }
        System.out.println("(" + prev_suff + ", "
                           + curr_count + ")");
    }

    // Driver program to test functions above
    public static void main(String[] args)
    {
        String input_text = "w3wiki";
        int k = 2;
        int len_text = input_text.length();

        // Computes the suffix array of our text
        System.out.println("Input Text: " + input_text);
        Rotation[] suffix_arr
            = computeSuffixArray(input_text, len_text);

        // Finds and outputs all valid pairs
        System.out.println("k-mers: ");
        findValidPairs(suffix_arr, len_text, k);
    }
}
Python
# Python program to solve K-mers counting problem

# Structure to store data of a rotation
class Rotation:
    def __init__(self, index, suffix):
        self.index = index
        self.suffix = suffix

# Takes input_text and its length as arguments
# and returns the corresponding suffix array
def compute_suffix_array(input_text, len_text):
    # List is needed to maintain old
    # indexes of rotations after sorting them
    suff = [Rotation(i, input_text[i:]) for i in range(len_text)]

    # Sorts rotations using comparison function
    # defined above
    suff.sort(key=lambda x: x.suffix)

    # Returns the computed suffix array
    return suff

# Takes suffix array, its size and valid length as
# arguments and outputs the valid pairs of k - mers
def find_valid_pairs(suffix_arr, n, k):
    curr_count = 1
    prev_suff = ""

    for i in range(n):
        if len(suffix_arr[i].suffix) < k:
            if i != 0 and len(prev_suff) == k:
                print(f"({prev_suff}, {curr_count})")
                curr_count = 1
            prev_suff = suffix_arr[i].suffix
            continue

        if len(prev_suff) >= k and len(suffix_arr[i].suffix) >= k and prev_suff[:k] == suffix_arr[i].suffix[:k]:
            curr_count += 1
        else:
            if i != 0 and len(prev_suff) == k:
                print(f"({prev_suff}, {curr_count})")
                curr_count = 1
                prev_suff = suffix_arr[i].suffix[:k] if len(suffix_arr[i].suffix) >= k else suffix_arr[i].suffix
            else:
                prev_suff = suffix_arr[i].suffix[:k] if len(suffix_arr[i].suffix) >= k else suffix_arr[i].suffix
                continue

        prev_suff = suffix_arr[i].suffix[:k] if len(suffix_arr[i].suffix) >= k else suffix_arr[i].suffix

    print(f"({prev_suff}, {curr_count})")

# Driver program to test functions above
def main():
    input_text = "w3wiki"
    k = 2
    len_text = len(input_text)

    # Computes the suffix array of our text
    print(f"Input Text: {input_text}")
    suffix_arr = compute_suffix_array(input_text, len_text)

    # Finds and outputs all valid pairs
    print("k-mers: ")
    find_valid_pairs(suffix_arr, len_text, k)

if __name__ == "__main__":
    main()
JavaScript
// Class to represent a rotation
class Rotation {
    constructor(index, suffix) {
        this.index = index;
        this.suffix = suffix;
    }
}

// Function to compute the suffix array
function computeSuffixArray(inputText) {
    const lenText = inputText.length;
    const suff = [];

    // Create rotations and store them in suff array
    for (let i = 0; i < lenText; i++) {
        suff.push(new Rotation(i, inputText.substring(i)));
    }

    // Sort rotations using comparison function
    suff.sort((a, b) => a.suffix.localeCompare(b.suffix));

    return suff;
}

// Function to find and output valid pairs of k-mers
function findValidPairs(suffixArr, k) {
    let currCount = 1;
    let prevSuffix = "";

    for (let i = 0; i < suffixArr.length; i++) {
        const suffix = suffixArr[i].suffix;
        
        if (suffix.length < k) {
            if (i !== 0 && prevSuffix.length === k) {
                console.log(`(${prevSuffix}, ${currCount})`);
                currCount = 1;
            }
            prevSuffix = suffix;
            continue;
        }

         if (prevSuffix.length >= k && suffix.length >= k && 
    prevSuffix.substring(0, k) === suffix.substring(0, k)) {

            currCount++;
        } else {
            if (i !== 0 && prevSuffix.length === k) {
                console.log(`(${prevSuffix}, ${currCount})`);
                currCount = 1;
                prevSuffix = suffix.length >= k ? suffix.substring(0, k) : suffix;
            } else {
                prevSuffix = suffix.length >= k ? suffix.substring(0, k) : suffix;
                continue;
            }
        }
        prevSuffix = suffix.length >= k ? suffix.substring(0, k) : suffix;
    }

    console.log(`(${prevSuffix}, ${currCount})`);
}

// Main function to test the above functions
function main() {
    const inputText = "w3wiki";
    const k = 2;

    console.log("Input Text:", inputText);

    // Computes the suffix array of the input text
    const suffixArr = computeSuffixArray(inputText);

    // Finds and outputs all valid pairs of k-mers
    console.log("k-mers:");
    findValidPairs(suffixArr, k);
}

// Call the main function
main();

Output: 

Input Text: banana$ 
k-mers:
(a$, 1)
(an, 2)
(ba, 1)
(na, 2)

Time Complexity: O(s*len_text*log(len_text)), assuming s is the length of the longest suffix.



Contact Us