# DNA_Markov_Chain.py

"""
Author: James Roberts
Last Updated: 10/20/2022

This program simulates Markov chains of DNA sequences and determines
whether each chain was more likely generated by Model 1 (M1)
or Model 2 (M2). It shows how using a simple likelihood
estimate has potential for error when classifying.

EX:
The probability of moving from nucleotide A to nucleotide T is 0.120
in M1 and 0.210 in M2.
"""

import math
import random

# Models (CpG Islands)
# Each matrix holds transition probabilities: the row is the nucleotide
# being left, the column is the nucleotide being entered, both in the
# order A, G, C, T.
M1 = [
    # A      G      C      T
    [0.180, 0.426, 0.274, 0.120],  # from A
    [0.161, 0.375, 0.339, 0.125],  # from G
    [0.171, 0.274, 0.367, 0.188],  # from C
    [0.079, 0.384, 0.355, 0.182],  # from T
]

M2 = [
    # A      G      C      T
    [0.300, 0.285, 0.205, 0.210],  # from A
    [0.248, 0.298, 0.246, 0.208],  # from G
    [0.322, 0.078, 0.298, 0.302],  # from C
    [0.177, 0.292, 0.239, 0.292],  # from T
]


# Simulation settings -- adjust these to change the experiment
length_of_sequence = 9          # nucleotides in each generated sequence
number_of_sequences = 10_000    # sequences generated per simulation run


def _log_transition_table(model):
    """Return *model* with every probability replaced by its natural log.

    A probability that is not positive maps to -infinity, so an impossible
    transition makes the whole sequence impossible under that model.
    """
    return [[math.log(p) if p > 0 else -math.inf for p in row]
            for row in model]


def markov_chain(length_of_sequence,
                 number_of_sequences,
                 model,     #Model to simulate
                 model1,
                 model2):

    """
    Simulate DNA sequences from one model and classify each of them by
    comparing its likelihood under two candidate models.

    Every sequence starts with a uniformly random nucleotide; each later
    nucleotide is drawn using the row of `model` that belongs to the
    current nucleotide. The likelihood of the finished sequence is then
    evaluated under `model1` and `model2`, and the sequence is counted
    for whichever model gives the strictly larger value.

    Likelihoods are accumulated as sums of log-probabilities instead of
    products of probabilities, so long sequences do not underflow to 0.0
    under both models and drop out of the counts.

    Parameters:
        length_of_sequence: number of nucleotides per sequence (int).
            With a value of 1 or less no transition is scored, so every
            sequence ties.
        number_of_sequences: number of sequences to simulate (int).
            A value of 0 or less simulates nothing.
        model: 4x4 transition matrix used to generate the sequences.
        model1: first 4x4 transition matrix to score against.
        model2: second 4x4 transition matrix to score against.
        All matrices use the row/column order A, G, C, T.

    Returns:
        A tuple (sequences_morelikely_M1, sequences_morelikely_M2).
        Sequences that are exactly equally likely under both models are
        counted in neither, so the two values may add up to less than
        `number_of_sequences`.
    """

    # Nucleotides, in the same order as the matrix rows and columns
    NUCLEOTIDES = ['A', 'G', 'C', 'T']
    position_of = {nucleotide: position
                   for position, nucleotide in enumerate(NUCLEOTIDES)}

    # The models do not change while simulating, so take the logs once
    log_model1 = _log_transition_table(model1)
    log_model2 = _log_transition_table(model2)

    # Starting probability is 0.25 for each nucleotide under both models
    log_start = math.log(0.25)

    sequences_morelikely_M1 = 0
    sequences_morelikely_M2 = 0

    # range() ends immediately for non-positive sizes, where the original
    # `while x != n` loops never terminated
    for _ in range(number_of_sequences):

        current = random.choice(NUCLEOTIDES)

        log_prob_M1 = log_start
        log_prob_M2 = log_start

        for _ in range(length_of_sequence - 1):
            row = position_of[current]
            following = random.choices(NUCLEOTIDES,
                                       weights=model[row],
                                       k=1)[0]
            column = position_of[following]

            log_prob_M1 += log_model1[row][column]
            log_prob_M2 += log_model2[row][column]

            # Only the latest nucleotide matters for the next transition
            current = following

        if log_prob_M1 > log_prob_M2:
            sequences_morelikely_M1 += 1
        elif log_prob_M2 > log_prob_M1:
            sequences_morelikely_M2 += 1

    return sequences_morelikely_M1, sequences_morelikely_M2


#####################################
# a) Generate the sequences with M1's weights, then score them against
#    both models
print('a)')

A = markov_chain(length_of_sequence, number_of_sequences, M1, M1, M2)

print('Sequences Generated by Model 1 weights')
print('Number of sequences more likely to be from Model 1: {:}'.format(A[0]))
# end='\n\n' leaves one blank line between the two reports
print('Number of sequences more likely to be from Model 2: {:}'.format(A[1]),
      end='\n\n')

#####################################
# b) Same experiment, but the sequences are generated with M2's weights
print('b)')

A = markov_chain(length_of_sequence, number_of_sequences, M2, M1, M2)

print('Sequences Generated by Model 2 weights')
print('Number of sequences more likely to be from Model 1: {:}'.format(A[0]))
print('Number of sequences more likely to be from Model 2: {:}'.format(A[1]))