Source code for openalea.phenomenal.tracking.alignment

"""Multiple sequence alignment"""

from copy import deepcopy
import numpy as np



[docs]
def needleman_wunsch(X, Y, gap, gap_extremity_factor=1.0):
    """
    Performs pairwise alignment of profiles X and Y with Needleman-Wunsch algorithm.
    A profile is defined as an array of one or more sequences of the same length.
    Each sequence includes one or several vectors of the same length, and might
    contain gaps (vectors filled with NaN)
    Source code : https://gist.github.com/slowkow/06c6dba9180d013dfd82bec217d22eb5
    The source code was modified to correct a few errors, and adapted to fit all
    requirements (extremity gap, customized scoring function, etc.)

    Parameters
    ----------
    X : array of shape (profile length, sequence length, vector length)
        profile 1
    Y : array of shape (profile length, sequence length, vector length)
        profile 2
    gap : float
        gap penalty parameter
    gap_extremity_factor : float
        optional factor to increase/decrease gap penalty on sequence extremities

    Returns
    -------
    (list, list)
    """

    if X.size == 0 and Y.size == 0:
        rx, ry = [], []
        return rx, ry

    gap_extremity = gap * gap_extremity_factor

    nx = X.shape[1]
    ny = Y.shape[1]

    # Optimal score at each possible pair of characters.
    F = np.zeros((nx + 1, ny + 1))
    F[:, 0] = np.linspace(start=0, stop=-nx * gap_extremity, num=nx + 1)
    F[0, :] = np.linspace(start=0, stop=-ny * gap_extremity, num=ny + 1)

    # Pointers to trace through an optimal alignment.
    P = np.zeros((nx + 1, ny + 1))
    P[:, 0] = 3
    P[0, :] = 4

    # Temporary scores.
    t = np.zeros(3)
    for i in range(nx):
        for j in range(ny):
            # TODO : gap argument should be useless ?
            t[0] = F[i, j] - alignment_score(X[:, i, :], Y[:, j, :], gap_extremity)

            if j + 1 == ny:
                t[1] = F[i, j + 1] - gap_extremity
            else:
                t[1] = F[i, j + 1] - gap

            if i + 1 == nx:
                t[2] = F[i + 1, j] - gap_extremity
            else:
                t[2] = F[i + 1, j] - gap

            tmax = np.max(t)
            F[i + 1, j + 1] = tmax
            if t[0] == tmax:
                P[i + 1, j + 1] += 2
            if t[1] == tmax:
                P[i + 1, j + 1] += 3
            if t[2] == tmax:
                P[i + 1, j + 1] += 4

    # Trace through an optimal alignment.
    i, j = nx, ny

    rx, ry = [], []
    condition = True
    while condition:
        if P[i, j] in [2, 5, 6, 9]:
            rx.append(i - 1)
            ry.append(j - 1)
            i -= 1
            j -= 1
        elif P[i, j] in [3, 5, 7, 9]:
            rx.append(i - 1)
            ry.append(-1)  # gap
            i -= 1
        elif P[i, j] in [4, 6, 7, 9]:
            rx.append(-1)  # gap
            ry.append(j - 1)
            j -= 1

        condition = i > 0 or j > 0

    rx = rx[::-1]
    ry = ry[::-1]

    return rx, ry




[docs]
def scoring_function(vec1, vec2):
    """
    Compute a dissimilarity score between two vectors of same length, which is
    equal to their euclidian distance.

    Parameters
    ----------
    vec1 : 1D array
    vec2 : 1D array, of same length than vec1

    Returns
    -------
    float
    """

    return np.linalg.norm(vec1 - vec2)




[docs]
def alignment_score(x, y, gap_extremity):
    """

    Compute a dissimilarity score between two arrays of vectors x and y.
    x and y can have different lengths, but all vectors in x and y must have
    the same length.

    Parameters
    ----------
    x : 2D array
        size (number of vectors, vector length)
    y : 2D array
        size (number of vectors, vector length)
    gap_extremity : float

    Returns
    -------
    float

    """

    # list of scores for each pair of vectors xvec and yvec,
    # with xvec and yvec being non-gap elements of x and y respectively.
    score = []
    for xvec in x:
        for yvec in y:
            if not all(np.isnan(xvec)) and not all(np.isnan(yvec)):
                score.append(scoring_function(xvec, yvec))

    if score:
        score = np.mean(score)
    else:
        score = gap_extremity  # TODO : hack

    return score




[docs]
def insert_gaps(all_sequences, seq_indexes, alignment):
    """
    Add gaps in sequences of 'all_sequences' whose indexes is in 'seq_indexes'.
    A gap is defined as a NAN array element in a given sequence.
    Gaps positions are given by 'alignment'.

    Parameters
    ----------
    all_sequences : list(2D array)
    seq_indexes : list(int)
    alignment : list(int)
        result from needleman_wunsch()

    Returns
    -------
    """

    all_sequences2 = deepcopy(all_sequences)
    gap_indexes = [i for i, e in enumerate(alignment) if e == -1]

    vec_len = max(len(vec) for seq in all_sequences for vec in seq)

    for si in seq_indexes:
        for gi in gap_indexes:
            if all_sequences2[si].size == 0:
                all_sequences2[si] = np.full((1, vec_len), np.nan)
            else:
                all_sequences2[si] = np.insert(all_sequences2[si], gi, np.nan, 0)

    return all_sequences2




[docs]
def multi_alignment(
    sequences, gap, gap_extremity_factor=1.0, start=0, align_range=None
):
    """
    Multiple sequence alignment algorithm to align n sequences, using a
    progressive method. At each step, a sequence (Y) is aligned with a matrix
    (X) corresponding to a profile (i.e. the alignment of k sequences)
    resulting in the alignment of k + 1 sequences. Each pairwise alignment
    of X vs Y is based on needleman-wunsch algorithm.

    Parameters
    ----------
    sequences : list of 2D arrays
        The list of sequences to align
    gap : float
        penalty parameter to add a gap
    gap_extremity_factor : float
        parameter to modify the gap penalty on sequence extremity positions,
        relatively to gap value.
        For example, if gap = 5 and gap_extremity_factor = 0.6,
        Then the penalty for terminal gaps equals 3.
    start : int
        sequences are progressively added to the global alignment from
        sequences[start] to sequences[0], then from
        sequences[start + 1] to sequences[-1]
    align_range : int
        When adding a new sequence to the global alignment, only the already
        aligned sequences with a distance inferior or equal to this parameter
        in the sequences order are used for the alignment.

    Returns
    -------
    """

    assert -1 <= start <= len(sequences) - 1

    aligned_sequences = deepcopy(sequences)

    # init
    # (k_start -> 0) then (k_start -> n)
    k_start = len(aligned_sequences) - 1 if start == -1 else start
    alignment_order = np.array(
        list(range(0, k_start + 1)[::-1])
        + list(range(k_start + 1, len(aligned_sequences)))
    )
    for k in range(1, len(aligned_sequences)):
        xi = alignment_order[:k]  # ref
        yi = alignment_order[k]

        # select the 2 profiles to align
        xi_in_range = (
            xi
            if align_range is None
            else [val for val in xi if abs(val - k) <= align_range]
        )
        X = np.array([aligned_sequences[i] for i in xi_in_range])
        Y = np.array([aligned_sequences[yi]])

        # alignment
        rx, ry = needleman_wunsch(X, Y, gap, gap_extremity_factor=gap_extremity_factor)

        # update all sequences from sq0 to sq yi
        aligned_sequences = insert_gaps(
            aligned_sequences, xi, rx
        )  # xi = sequences that all have already been aligned
        aligned_sequences = insert_gaps(aligned_sequences, [yi], ry)

    # convert list of aligned sequences (all having the same length) in a matrix of vector indexes (-1 = gap)
    s = np.array(aligned_sequences).shape
    alignment_matrix = np.full((s[0], s[1]), -1)
    for i, aligned_seq in enumerate(aligned_sequences):
        no_gap = np.array([not all(np.isnan(e)) for e in aligned_seq])
        alignment_matrix[i][no_gap] = np.arange(sum(no_gap))

    return alignment_matrix