With the hope to help the students of the team with the design of primer, I tried to develop a software to do primer design. There is still work to do on it, but I think what I did could help some others students with the goal to write a software like this ! They could use it as a draft or as a page with links to resources.
You can click on this button to get all the code I wrote in Python :
import math
#If you want to move to see the principal section of the code, please use the tag KEYSECTION
#For the flaws that I don't know of to fix but that are probably not a big deal, use #TODO
#from https://www.sigmaaldrich.com/technical-documents/articles/biology/oligos-melting-temp.html
# Tm = melting temperature in °C
# ΔH = enthalpy change in kcal mol-1 (accounts for the energy change during annealing / melting)
# A = constant of -0.0108 kcal K-1 ᐧ mol-1 (accounts for helix initiation during annealing / melting)
A = -0.0108
# ΔS = entropy change in kcal K-1 ᐧ mol-1 (accounts for energy unable to do work, i.e. disorder)
# R = gas constant of 0.00199 kcal K-1 ᐧ mol-1 (constant that scales energy to temperature)
R = 0.001987
#molar gaz constant, in cal/Celsius*mol)
#R = 1987
# C = oligonucleotide concentration in M or mol L-1 (we use 0.0000005, i.e. 0.5 µM)
# -273.15 = conversion factor to change the expected temperature in Kelvins to °C
# [Na+] = sodium ion concentration in M or mol L-1 (we use 0.05, i.e. 50 mM)
Na = 0.05
#dimensionless (molar) concentration
#gamma = 0.0000005
gamma = 50*10**(-9)
#from Kelvin to Celsius
T0 = - 273.15
#empirical temperature correction in Celsius
t = -21.6
listOfConstraints = ["bpLen","TmNN", "countRuns", "clampGC", "selfComplementarity", "findRestric"]
listOfConstraintsPair = ["TmDiff", "paForwardreverse", "paTargetF", "paTargetR"]
listOfEnzymes = ["XbaI", "EcoRI", "SpeI", "NotI", "PstI"]
dicoEnzymes = {}
dicoEnzymes["XbaI"] = "TCTAGA"
dicoEnzymes["EcoRI"] = "GAATTC"
dicoEnzymes["SpeI"] = "ACTAGT"
dicoEnzymes["NotI"] = "GCGGCCGC"
dicoEnzymes["PstI"] = "CTGCAG"
def palindrome(x):
l = len(x) - 1
for i in range(l):
if(x[i] != x[l-i]):
return False
return True
# ************ FUNCTIONS TO CALCULATE THE MELTING TEMPERATURE **********
def delta_H_S(p1,p2):
"""
A function that returns the delta_H (free enthalpy in kcal/mol) and the delta_S (entropy in cal/K) of a pair of nuclotides, using the tables from Breslauer (1986). Please, refer the ressources for references and #TODO
Parameters
----------
p1 : str
a one-letter string ("A", "T", "C" or "G") to represent the first nucleotide
p2 : str
a one-letter string to represent the second nucleotide of the pair
"""
p3 = str(p1) + str(p2) #concatenate the two nucleotides
if(p3 == "AA" or p3 =="TT"):
H = 9.1
S = 24.0
elif(p3 == "AT"):
H = 8.6
S = 23.9
elif(p3 == "TA"):
H = 6.0
S = 16.9
elif(p3 == "CA" or p3 == "TG"):
H = 5.8
S = 12.9
elif(p3 == "GT" or p3 == "AC"):
H = 6.5
S = 17.3
elif(p3 == "CT" or p3 == "AG"):
H = 7.8
S = 20.8
elif(p3 == "GA" or p3 == "TC"):
H = 5.6
S = 13.5
elif(p3 == "CG"):
H = 11.9
S = 27.8
elif(p3 == "GC"):
H = 11.1
S = 26.7
elif(p3 == "GG" or p3 == "CC"):
H = 11.0
S = 26.6
#this is working for oligo
#return H, S/1000
return H, S
def sum_deltas_H_S(x):
"""
A function that returns the total enthalpy (Htot) and entropy (S) of a sequence of DNA, summing the delta_enthalpy (h) and delta_entropy (s) of each pair of nucleotides. For example, to calculate H of "AATTGCCG" we add up h of "AA", h of "AT", h of "TT", ...
Parameters
----------
x : str
the sequence of DNA from who enthalpy and entropy is calculated
"""
H, S = 0,0
for i in range(0, len(x)-1):
h, s = delta_H_S(x[i],x[i+1])
#print("h = ", h, "s = ", s)
H = H + h
S = S + s
#print("H = ", H, "S = ", S)
return H, S
def TmNN(x):
"""
A function that return the melting temperature of a sequence using the Nearest-Neighbour Method.
Parameters
----------
x : str
the sequence of DNA whom melting temperature is calculated
"""
x = x.upper()
H, S = sum_deltas_H_S(x)
#print("H = ", H, "A = ", A, "S = ",S,"R = ", R, "gamma = ",gamma, "T0 = ", T0, "Na = ", Na)
#Formule from oligon
Tm = (-H/(A-S/1000+R*(math.log(gamma/4))))+ T0 + 16.6*math.log10(Na)
#Formule from the prog article
#Tm = (H/(S+R*(math.log(gamma/4))))+ T0 + t
#Formula from http://www.premierbiosoft.com/tech_notes/PCR_Primer_Design.html , with salt correction
#N = len(x)-1
#Tm = (-H/((-S/1000+0.368*N*math.log(Na))+R*math.log(gamma/4)))+T0
return Tm
def TmBasic(x):
x = x.upper()
counts = {
"A" : 0,
"T" : 0,
"C" : 0,
"G" : 0
}
for let in counts:
for nuc in x:
if(nuc == let):
counts[let] += 1
return 2*(counts["A"]+counts["T"]) + 4*(counts["G"] + counts["C"])
def TmDiff(p,q):
"""
A function that takes as entry two primers (p and q) and return the difference between their melting temperatures"""
return abs(TmNN(p)-TmNN(q))
def Tanneal(p,q):
"""Take in entry two primers/ actually I would probably need a class with in it total sequence and decided primers/ and returns the recommanded annealing temperature #TODO"""
return 0
#KEYSECTION
#******************** USEFUL **********************
def listlist_ToList(ll):
"""This function takes in entry a list containing lists - and return a single big list containing the inside of all the inside lists. For example [[3,5,2],[4,2,3]] -> [3,5,2,4,2,3]"""
main = ll[0]
for i in range(1,len(ll)):
for item in ll[i]:
main.append(item)
return main
#KEYSECTION
#********************GC CONSTRAINT **********************
def percentGC (seqDNA):
"""A function that takes in entry a sequence of DNA and returns it's GC content"""
countGC = 0
for nucleotide in seqDNA.upper():
if nucleotide == 'G' or nucleotide == 'C':
countGC += 1
return (countGC/len(seqDNA))*100
def clampGC (seqDNA):
"""A function that takes in entry a sequence of DNA and returns it's number of G or C nucleotides in the last 5 nucleotides"""
if(len(seqDNA) >= 5):
last5 = seqDNA[-5:]
countGC = 0
for i in last5:
if(i == "G" or i == "C"):
countGC += 1
return countGC
else:
raise Exception("Your sequence of DNA has",len(seqDNA),"base pairs - it must be at least 5 base pairs long to calculate the GC clamp")
#KEYSECTION
#********************RUN AND REPEAT CONSTRAINTS **********************
#TODO : I think the design of count2nuc is quite poor - I am sure there would be a way to do the same thing in a more readable way. I feel like there even would be the possibility to combine count2nuc and countnuc in one function (like take for parameters seqDNA and nnuc -and use len(nnuc) for)
def count2nuc(seqDNA, nuc1, nuc2):
"""
This is a function that takes in entry a sequence of DNA and a dinucleotide (nuc1 + nuc2); and returns the number and the length of uninterrupted repeats of this dinucleotide in the sequence in the form of a list (for ex, count2nuc("GATATGGCCATATAT", "A","T") returns [0,2,0,0,0,3])
Parameters
----------
seqDNA : str
The sequence of DNA whose repetitive dinucleotide will be counted
nuc1 : str
a one-letter string ("A", "T", "C" or "G") to represent the first nucleotide
nuc2 : str
a one-letter string to represent the second nucleotide of the pair
"""
listOfRepeats = []
nuc3 = nuc1 + nuc2 #get the dinucleotide
count = 0; #count is the variable to count the number of repeats
i = 0 #i give the count down and the index
while i < len(seqDNA)-1: #I didn't use a for boucle, since I need to have a variable step for i
if(seqDNA[i:i+2] == nuc3): #the slice is [i:i+2] to get 2 letters - [i:i] returns only 1
count = count + 1
i = i + 2 #when there is a repeat, then you should advance of two letters in the sequence so you don't count ATAT like AT then TA then AT
else: #when there is no repeats
i = i + 1
listOfRepeats.append(count)
count = 0 #there is an interruption in the repeats
listOfRepeats.append(count)#so the last count gets append too
return listOfRepeats
def countRepeats(seqDNA):
"""This is a function that takes in entry a sequence of DNA and returns a dictionnary containing the length of the uninterrupted repeats for each 24 dinucleotides """
lnuc = ["A","T","C","G"]
dictOfList = {}
for m in lnuc:
for n in lnuc:
if m != n: #this is for the countRun function
dictOfList[m+n] = count2nuc(seqDNA,m,n)
return dictOfList
def countnuc(seqDNA, nuc):
"""This is a function that takes in entry a sequence of DNA and a nucleotide and returns the length of the runs of this nucleotide"""
run = 0
listOfRuns = []
for i in range(0, len(seqDNA)):
if(seqDNA[i] == nuc):
run = run + 1
else: #run interrupted
listOfRuns.append(run)
run = 0
listOfRuns.append(run)
return listOfRuns
def countRunsSeparated(seqDNA):
"""This is a function that takes in entry a sequence of DNA and returns a dictionnary containing the length of the uninterrupted repeats for each 4 nucleotides"""
lnuc = ["A","T","C","G"]
dictOfList = {}
for m in lnuc:
dictOfList[m] = countnuc(seqDNA,m)
return dictOfList
def countRuns(seqDNA):
"""This is a function that takes in entry a sequence of DNA and returns a list containing the length of the uninterrupted repeats for each 4 nucleotides"""
lnuc = ["A","T","C","G"]
listOfruns = []
for m in lnuc:
listOfruns.append(countnuc(seqDNA,m))
runs = listlist_ToList(listOfruns)
return max(runs)
#********************RESTRICTION SITE**********************
def findRestric(seqDNA):
"""This is a function that takes in entry a sequence of DNA and returns the number of restrictions site this sequence contains"""
restric = 0
for enzyme in listOfEnzymes:
if dicoEnzymes[enzyme] in seqDNA:
restric = restric + 1
return restric
def addRestric():
"""This is a function to allow the user to add his own restriction site detector (restriction site that shouldn't be contained)"""
enz = input("Please enter the name of the enzyme : ")
site = input("Please enter the restriction site :")
listOfEnzymes.append(enz)
dicoEnzymes[str(enz)] = str(site)
#********************COMPLEMENTARITY AND SELF-COMPLEMENTARY***************
#all the algorithms in this section are based on this article : Thomas Kämpke, Markus Kieninger, Michael Mecklenburg, Efficient primer design algorithms , Bioinformatics, Volume 17, Issue 3, March 2001, Pages 214–225, https://doi.org/10.1093/bioinformatics/17.3.214
def s(a,b):
"""return the score of bonding of two nucleotides (it's stronger for GC since it has 3 bonds of hydrogen and AT has only 2)"""
if (a == "A" and b == "T") or (a =="T" and b == "A"):
return 2
elif (a == "C" and b == "G") or (a == "G" and b == "C"):
return 4
else:
return 0
def S(x,y):
"""this function returns the complementarity (higher score, higher probability to bind) between two sequences"""
#this takes in account the different possible alignement. Ex for x= CAGT and y = CAGT, we have for example those 2 differents alignements:
#C - T C - G
#A - o A - T
#G - o G - o
#T - o T - o
x = x.upper()
y = y.upper()
align = []
if(len(x) > len(y)): #it's easier to code if x is always smaller than y
x,y = y,x
n,m = len(x), len(y)
for i in range(m-1):
y = "o" + y + "o" #to not provoke an error
for k in range(m-1,3*(m-1)+1):
tot = 0
#print("For km = ", k-2*(m-1), "soit k = ", k)
for i2 in range(0, n):
#print(x[i2], y[k-i2])
tot = tot + s(x[i2],y[k-i2])
#print("-> ", tot)
align.append(tot)
#print(align)
return align
def Sprime(x,y):
"""this function returns the number and strengh of bonds (higher score, higher probability to bind)between two sequences in an ininterrupted way from the start in different alignements"""
#the same code than for S except that you break if there is an interruption in the bond making
x = x.upper()
y = y.upper()
align = []
if(len(x) > len(y)):
temp = x
x = y
y = temp
n,m = len(x), len(y)
for i in range(m-1):
y = "o" + y + "o"
for k in range(m-1,3*(m-1)+1):
tot = 0
print("For km = ", k-2*(m-1), "soit k = ", k)
for i2 in range(0, n):
bond = s(x[i2],y[k-i2])
print(x[i2], y[k-i2])
print("k-i2 = ", k-i2)
if(bond == 0 and y[k-i2] != 'o'): #the first bond between the two sequences has to score more than 0
break
tot = tot + bond
print("-> ", tot)
align.append(tot)
#print(align)
return align
def pa(p,q):
"""so pair annealing"""
return max(S(p,q))
def pea(p,q):
"""so pair-end annealing"""
return max(Sprime(p,q))
#why is that working like that, who knows
def sea(seqDNA):
"""self-end annealing"""
return max(Sprime(seqDNA[::-1],seqDNA))
def selfComplementarity(seqDNA):
"""self annealing"""
return max(S(seqDNA, seqDNA))
#********************DISTANCE FROM THE GENE******************
#KEYSECTION
#******************SYSTEM OF SCORE*******************
#KEY SYSTEM : We enter the table containing the keys points, and the computer calculates the intermediates for us. There are two interpolations mode available : constant (fillTableConstant) and linear (fillTableLinearly). For example, on constant mode, if I enter keyX = [1,5,13] and keyY = [10,52,0] that mean at 1 on the abscisses I want 10, at 5 I want 52 and at 13, I want 0. So on constant mode, I would get a list containing four 10 (5-1) and eight (13-5) 52. On linear mode, I would get [10, 20.5, 31.0, 41.5, 52, 45.5, 39.0, 32.5, 26.0, 19.5, 13.0, 6.5]
#TODO: I feel like making [1,2,3,4,5,6,..] like range_x or table_x in the followings, is a bit of a stupid thing to do - I guess there is a better, more readable way, but well the following seems to be working
# def fillRange(xstart, xend, yvalue):
# """This function return a range of (xend - xstart) length containing the yvalue"""
# range_x = []
# range_y = []
# for i in range(xstart, xend):
# range_x.append(i)
# range_y.append(yvalue)
# return range_x, range_y
# def fillTableConstant(keyX, keyY):
# """This function takes in entry a list of keyPoints of coordinates (keyX[i], keyY[i]) and fill the in between with constant values"""
# tableX = []
# tableY = []
# n = len(keyX)
# if(keyX == sorted(keyX) and n == len(keyY)):
# for i in range(0, n-1, 1):
# range_x, range_y = fillRange(keyX[i], keyX[i+1], keyY[i])
# tableX.append(range_x)
# tableY.append(range_y)
# return listlist_ToList(tableX), listlist_ToList(tableY)
def fillTableLinearly(keyX, keyY):
"""This function takes in entry a list of keyPoints of coordinates (keyX[i], keyY[i]) and fill the in between with linear values (ex to go from 12 to 16 in three points, it's 12, 14,16)""" #TODO : Actually, there is an error here - the value we get are 12, 13.333 and 14.666. It's good enough for what I intend to do (fill in between the keys points) but it does not do exactly what I want
tableX = []
tableY = []
n = len(keyX)
if(keyX == sorted(keyX) and n == len(keyY)):
for i in range(0,n-1,1):
xs = keyX[i] #xs for xstart
xe = keyX[i+1] #xend
ys = keyY[i] #ystart
ye = keyY[i+1] #yend
steps = (ye - ys)/(xe-xs) #the rate increase
x = xs
y = ys
for i2 in range(xs, xe,1):
tableX.append(x)
tableY.append(y)
x = x + 1
y = y + steps
return tableX, tableY
#KEYSECTION
#******************DATA FOR PENALTIES*******************
#Table system
class ScoreTable:
"""
A class that contains the data (obtained by the table system) to count the penalties for the primer itself: you enter the constraint (for ex, base pair) and it returns its penalties table
Argument
--------
constraint : str
one of the constraint for an individual primer (like its length, it's melting temperature,...)
Method
------
coordFromKey : function
A function that returns the graph from the key points
"""
def __init__(self, constraint):
self.constraint = constraint
if(constraint == "bpLen"):
self.x_key = [0,10,14,17,23,36,45,100]
self.y_key = [100,30,10,0,5,15,30,100]
self.x, self.y = self.coordFromKey(self.x_key,self.y_key)
elif(constraint == "TmNN"):
self.x_key = [0,20,45,52,58,65,200,201]
self.y_key = [350,100,30,0,30,50,300,500]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "percentGC"):
self.x_key = [0,20,40,61,81]
self.y_key = [50,30,0,30,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "countRuns"):
self.x_key = [0,5,11]
self.y_key = [0,20,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "clampGC"):
self.x_key = [0,3,5,6]
self.y_key = [20,0,20,0]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "selfComplementarity"):
#or self-end
self.x_key = [0,1,6,11,100]
self.y_key = [0,5,20,30,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
def coordFromKey(self,x_key, y_key):
#self.x_ll, self.y_ll = fillTable(self.x_key,self.y_key)
#self.x, self.y = listlist_ToList(self.x_ll), listlist_ToList(self.y_ll)
self.x, self.y = fillTableLinearly(self.x_key, self.y_key)
return self.x,self.y
class ScoreTablePairs:
"""
A class that contains the data (obtained by the table system) to count the incompatibilities between a reverse and a forward primer: you enter the constraint (for ex, the melting temperature difference) and it returns its penalties table
Argument
--------
constraint : str
one of the constraint for the primer that involve something else than itself (like it's complementarity with the target site, the melting temperature with the other primer...)
Method
------
coordFromKey : function
A function that returns the graph from the key points
"""
def __init__(self, constraint):
if(constraint == "TmDiff"):
self.x_key = [0,3,5,11,50]
self.y_key = [0,5,50,500,1000]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "paForwardreverse"):
self.x_key = [0,1,6,11,21,200]
self.y_key = [0,5,20,30,50,500]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "paTargetF"):
self.x_key = [0,1,6,11,21,200]
self.y_key = [0,5,20,30,50,500]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "paTargetR"):
self.x_key = [0,1,6,11,21,200]
self.y_key = [0,5,20,30,50,500]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
def coordFromKey(self,x_key, y_key):
self.x, self.y = fillTableLinearly(self.x_key, self.y_key)
return self.x,self.y
#function system
#TODO : Finish writing this class, that for the most part for the moment is a copy of the table class - use the functions on the research guide sheet.
class funcTable:
"""
A class that contains the data (obtained by the function system) to count the penalties for the primer itself: you enter the constraint (for ex, base pair) and it returns its penalties graph
Arguments
--------
constraint : str
one of the constraint for an individual primer (like its length, it's melting temperature,...)
Method
------
coordFromKey : function
A function that returns the graph from the key points
"""
def __init__(self, constraint):
self.constraint = constraint
if(constraint == "bpLen"):
ok =1
elif(constraint == "TmNN"):
self.x_key = [0,20,45,52,58,65]
self.y_key = [150,100,30,0,30,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "TmDiff"):
self.x_key = [0,3,5,11]
self.y_key = [0,5,50,100]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "percentGC"):
self.x_key = [0,20,40,61,81]
self.y_key = [50,30,0,30,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "countRuns"):
self.pMax = 100
self.n = 4
self.K = 3
elif(constraint == "clampGC"):
self.x_key = [0,3,5,6]
self.y_key = [20,0,20,0]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "selfComplementarity"):
self.x_key = [0,1,6,11,21]
self.y_key = [0,5,20,30,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "paForwardreverse"):
self.x_key = [0,1,6,11,21]
self.y_key = [0,5,20,30,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
elif(constraint == "paTarget"):
self.x_key = [0,1,6,11,21]
self.y_key = [0,5,20,30,50]
self.x, self.y = self.coordFromKey(self.x_key, self.y_key)
def calculatePen(self,x):
return self.pMax*(x**self.n)/((self.K**self.n) + (x**self.n))
#****************DESCRIBE THE PRIMER*********************
class Sequence:
"""
A class that described the Sequence
Arguments
---------
seqDNA : str
a string containing the sequence of DNA of the whole sequence
target : str
a string containing the sequence of DNA of the target
target_start : num
the position where the target begins
target_end : num
the position where the target ends
"""
def __init__(self, seqDNA, target, start = 0):
self.seqDNA = seqDNA.upper()
#not sure why I wrote the following two statements
self.start = start
self.end = len(seqDNA)-1
if(target in seqDNA):
self.target = target
self.target_start = seqDNA.find(target)
self.target_end = self.target_start + len(self.target) - 1
class Primer:
"""
A class that describes the primer, giving access to each of its caracteristics and it's score (total penalties)
Arguments:
---------
seqDNA : str
a string containing the sequence of DNA describing the primer
direction : str
a string indicating if the primer is foward ("F") or reverse ("R"). By default, it's "F"
Attributes:
-----------
bpLen : num
the length of the primer
gcContent: num
the gcContent of the primer
TmNN : num
the melting temperature of the primer
countRuns : num
the number of runs in the primer
clampGC : num
the number of G or C in the 5 last nucleotides of the primer
selfComplementarity : num
the number of bonds between two example of this same primer (leds to primer dimers)
score : num
result of the the score method
Method:
-------
score(self) : function
A function that check the penalties of the primer in respect to each constraint and returns the total penalties
"""
def __init__(self, seqDNA, direction = "F"):
self.seqDNA = seqDNA.upper()
self.direction = direction #reverse or forward Primer Raise an error
self.bpLen = len(seqDNA)
self.gcContent = percentGC(seqDNA)
self.TmNN = TmNN(seqDNA)
self.countRuns = countRuns(seqDNA)
self.clampGC = clampGC(seqDNA)
self.selfComplementarity = selfComplementarity(seqDNA)
self.score = self.score()
def funcRuns_pen(self):
func = funcTable("countRuns")
return func.calculatePen(self.countRuns)
def score(self):
tot_pen = []
for constraint in listOfConstraints:
#print("constraint = ", constraint)
data = ScoreTable(constraint)
if(constraint == "bpLen"):
bpLen_pen = data.y[self.bpLen]
tot_pen.append(bpLen_pen)
print("bpLen_pen = ", bpLen_pen)
elif(constraint == "TmNN"):
TmNN_pen = data.y[round(self.TmNN)]
tot_pen.append(TmNN_pen)
print("TMnn_pen = ", TmNN_pen)
elif(constraint == "percentGC"):
percentGC_pen = data.y[self.gcContent]
tot_pen.append(percentGC_pen)
print("percentGC_pen = ", percentGC_pen)
elif(constraint == "countRuns"):
countRuns_pen = data.y[self.countRuns]
tot_pen.append(countRuns_pen)
print("countRuns_pen = ", countRuns_pen)
elif(constraint == "clampGC"):
clampGC_pen = data.y[self.clampGC]
tot_pen.append(clampGC_pen)
print("clampGC_pen = ", clampGC_pen)
elif(constraint == "selfComplementarity"):
selfComplementarity_pen = data.y[self.selfComplementarity]
tot_pen.append(selfComplementarity_pen)
print("selfComplementarity_pen = ", selfComplementarity_pen)
return sum(tot_pen)
class pairPrimer:
"""
A class that describes the pair of primer, giving access to each of its caracteristics and it's score (total penalties)
Arguments:
---------
forward : Primer
the forward primer of the pair (with access to all of its attributes)
reverse : Primer
the reverse primer of the pair (with access to all of its attributes)
seq : Sequence
the total sequence of DNA given by the user (where the primers binds, that contains the target sequence,...)
Attributes:
-----------
f : str
the sequence of DNA of the forward primer
r : str
the sequence of DNA of the reverse primer
seqDNA : str
the forward and the reverse primer reunited
TmDiff : num
the difference in the melting temperature between the two primers
paForwardreverse : num
the complementarity between the forward and the reverse primer
paTargetF : num
the complementarity between the forward primer and the target
paTargetR : num
the complementarity between the reverse primer and the target
Method:
-------
score(self) : function
A function that check the penalties of the pair of primer in respect to each constraint and returns the total penalties
"""
def __init__(self, forward, reverse, seq):
self.forward = forward
self.reverse = reverse
self.f = forward.seqDNA
self.r = reverse.seqDNA
self.seqDNA = "F : " + self.f + " R : " + self.r
self.TmDiff = TmDiff(self.f,self.r)
self.paForwardreverse = pa(self.f, self.r)
self.paTargetF = pa(self.f, seq.target)
self.paTargetR = pa(self.r, seq.target )
self.score = self.score() + self.forward.score + self.reverse.score
def score(self):
tot_pen = []
for constraint in listOfConstraintsPair:
#print("constraint = ", constraint)
data = ScoreTablePairs(constraint)
if(constraint == "TmDiff"):
TmDiff_pen = data.y[round(self.TmDiff)]
tot_pen.append(TmDiff_pen)
print("TmDiff = ", TmDiff_pen)
elif(constraint == "paForwardreverse"):
paForwardreverse_pen = data.y[self.paForwardreverse]
tot_pen.append(paForwardreverse_pen)
print("paForwardreverse = ", paForwardreverse_pen)
elif(constraint == "paTargetF"):
paTargetF_pen = data.y[self.paTargetF]
tot_pen.append(paTargetF_pen)
print("paTargetF = ", paTargetF_pen)
elif(constraint == "paTargetR"):
paTargetR_pen = data.y[self.paTargetR]
tot_pen.append(paTargetR_pen)
print("paTargetR = ", paTargetR_pen)
return sum(tot_pen)
# print("\n \n *****DESCRIPTION OF THE PRIMERS P AND Q ******* \n")
# p = Primer("GGATTGATAATGTAATAGG","F")
# q = Primer("CATTATGGGTGGTATGTTGG","R")
# print("p : ",p.__dict__, "\n")
# print("q : ",q.__dict__)
# print("\n \n")
# print("******PENALTIES OF P (based on the table in Research Guide Sheet)******")
# ps = p.score()
# print("\n tot pen of P = ", sum(ps))
# print("\n \n")
# print("********PENALTIES OF Q (based on the table in Research Guide Sheet) ****** ")
# qs = q.score()
# print("\n tot pen of Q = ", sum(qs))
# print("\n \n")
# print("***** Counting penalties for runs with p = pMax * N^n / (K^n + N^n) *****")
# #TO DO : change from funcRuns to funcAll(runs)
# print("countRuns_pen of p = ", p.funcRuns_pen())
# print("countRuns_pen of q = ", q.funcRuns_pen())
#print("pa(p,q) = ", pa(p.seqDNA,q.seqDNA), "pea(p,q) = ", pea(p.seqDNA,q.seqDNA))
#************LOOK THROUGH ALL THE PRIMERS AND FIND THE BEST ONE************
seqDNA = "CCTTGGCCTCTGCCTAATCACACAGATTCTAACAGGATTATTTCTCGCAATACACTACACAGCTGACATCTCAACAGCCTTCTCCTCCGTCGCCCACATCTGTCGAGATGTTAACTACGGATGACTAATTCGAAACATTCATGCAAACGGAGCCTCCTTTTTCTTCATCTGCCTCTACCTTCACGTAGCCCGAGGCATATACTATGGCTCATACCTCTACAAAGAAACCTGAAACATCGGAGTAGTTCTCCTACTCCTAACTATAATAACCGCCTTCGTAGGATATGTGCTCCCATGAGGACAGATATCCTTCTGAGGAGCCACCGTAATTACCAACCTTCTTTCCGCCTTCCCCTACATCGGGGACACCCTAGTACAATGAATCTGAGGTGGTTTCTCAGTAGACAACGCCACCCTAACC"
target = "TAGCCCGAGGCATATACTATGGCTCATACCTCTACAAAGAAACCTGAAACATCGGAGTAGTTCTCCTACTCCTAACTATAATAACCGCCTTCGTAGGATATGTGCTCCCATGAGGACAGATATCCTTCTGAGGAGCC"
seq = Sequence(seqDNA, target)
def findPrimer(seq, direc = "F"):
"""A function that takes in entry the whole sequence of DNA and the direction of the primers we are looking for (by default "F" for forward) and returns a list of the possibles primers"""
allP = [] #contains all the primers (of all lengths)
bpMin = 15 #mininum base length
bpMax = 35 # maximum base length
if bpMax > len(seq.seqDNA): #we don't want to have this absurdity of a primer bigger than a sequence - so in case the sequence provided by the user is very short, we have to go down with the length of the primer
bpMin = 0
bpMax = round(len(seq.seqDNA)/3)
for bp in range(bpMin, bpMax):
#we test for the primers of all lengths possibles
allbpP = [] #contains all the primers of length bp
if direc == "F":
for i in range(seq.start, seq.target_start - bp, 1): #the first forward primer will have his start on the start of the whole DNA sequence - and the last primer will have his end on the start of the target sequence that we don't want to bind on
p = Primer(seq.seqDNA[i:i+bp])
allbpP.append(p)
elif direc == "R":
for i in range(seq.end, seq.target_end + bp, -1):
p = Primer(seq.seqDNA[i-bp:i])
allbpP.append(p)
allP.append(allbpP)
print("allbpP = ", allbpP)
return allbpP
def findPair(allF, allR, seq):
"""
A function that takes in entry all the possible forwards (allF) and reverse (allR) primers in the sequence (seq) and returns the list of all the possible pairs of primers sorted by penalties score
"""
#allF, allR = sorted(allF1), sorted(allR1)
#took the following code from here : https://stackoverflow.com/questions/403421/how-to-sort-a-list-of-objects-based-on-an-attribute-of-the-objects/48731059
try: import operator
except ImportError: keyfun= lambda x: x.score # use a lambda if no operator module
else: keyfun= operator.attrgetter("score") # use operator since it's faster than lambda
allF.sort(key=keyfun, reverse=True) # sort in-place
allR.sort(key=keyfun, reverse=True) # sort in-place
allPair = []
for f in allF:
for r in allR:
pair = pairPrimer(f, r, seq)
allPair.append(pair)
try: import operator
except ImportError: keyfun= lambda x: x.score # use a lambda if no operator module
else: keyfun= operator.attrgetter("score") # use operator since it's faster than lambda
allPair.sort(key=keyfun, reverse=True) # sort in-place
print("SCORE FOR PAIR OF PRIMERS")
for i in range(0,len(allPair)):
print(allPair[i].seqDNA, " = ", allPair[i].score)
print("***************** IN THE F DIRECTION ***********************")
fsc = findPrimer(seq)
print("********************IN THE R DIRECTION ********************")
rsc = findPrimer(seq, direc = "R")
findPair(fsc, rsc, seq)
#**************MAKE A GRAPHIC INTERFACE*************************
And on this one to get the code in R (i am not just it's the most up-to-date version I have):
library("TmCalculator")
library("stringi")
listOfEnzymes = list("XbaI" = "TCTAGA", "EcoRI" = "GAATTC", "SpeI" = "ACTAGT", "NotI" = "GCGGCCGC", "PstI" = "CTGCAG")
percentGC <- function(seqDNA){
seqsplit <- unlist(strsplit(seqDNA, split=""))
countG <- 0
countC <- 0
for(x in seqsplit){
if (x == "G"){
countG <- countG + 1
}
if (x == "C"){
countC <- countC + 1
}
}
return(((countC + countG)/nchar(seqDNA))*100)
}
clampGC <- function(seqDNA){
last5 <- stri_sub(seqDNA,-5,-1)
last5split <- unlist(strsplit(last5, split=""))
paste(last5split)
count <- 0
for (i in last5split){
if(i=="G" | i == "C" ){
count <- count + 1
}
}
return (count)
}
count2nuc <- function(seqDNA, nuc1, nuc2){
seqsplit <- unlist(strsplit(seqDNA, split=""))
listOfRepeats <- list()
nuc3 <- paste(nuc1, nuc2,sep='')
count <- 0
i <- 1
index <- 1
while(i < nchar(seqDNA)){
#print(paste(unlist(seqsplit[i:(i+1)]), collapse=''))
if(paste(unlist(seqsplit[i:(i+1)]), collapse='') == nuc3){
paste("hello")
count <- count + 1
i <- i + 2
}
else{
if(count != 0) {
listOfRepeats[[index]] <- count
index <- index + 1
}
i <- i + 1
count <- 0
}
}
listOfRepeats[[index]]<- count
return(listOfRepeats)
}
countRepeats <- function(seqDNA){
lnuc <- c("A", "T", "C", "G")
totRepeats <- list()
for (m in lnuc){
for (n in lnuc){
if (m != n){
totRepeats[[paste(m,n, sep='')]] <- count2nuc(seqDNA, m,n)
}
}
}
return(totRepeats)
}
countnuc <- function(seqDNA, nuc){
seqsplit <- unlist(strsplit(seqDNA, split=""))
run <- 0
listOfRuns <- list()
index <- 1
for(i in 1:nchar(seqDNA)){
if(seqsplit[i] == nuc){
run <- run + 1
}
else{
if(run != 0){
listOfRuns[[index]] <- run
index <- index + 1
}
run <- 0
}
}
listOfRuns[[index]] <- run
return(listOfRuns)
}
countRuns <- function(seqDNA){
lnuc <- c("A","T", "C", "G")
totRuns <- list()
for(m in lnuc){
totRuns[[m]] <- countnuc(seqDNA, m)
}
return(totRuns)
}
findRestric <- function(seqDNA){
restric <- 0
seqsplit <- unlist(strsplit(seqDNA, split=""))
for(enzyme in listOfEnzymes ){
if(grepl(enzyme, seqDNA)){
restric = restric + 1
}
}
return (restric)
}
s <- function(a,b){
if((a == "A" & b == "T") | (a == "T" & b == "A")){
return (2)
}
else if((a == "C" & b == "G") | (a == "G" & b == "C")){
return (4)
}
else{
return (0)
}
}
# fillRange <- function(xstart, xend, yvalue){
# range_y <- list()
#
# for(i in (xstart:xend)){
# range_y[[i]] <- yvalue
# }
# return (range_y)
# }
#
# #https://stackoverflow.com/questions/43477069/how-to-sort-list-in-r
# #sortBy <- function(a, field) a[order(sapply(a, "[[", i = field))]
#
# fillTable <- function(keyX, keyY){
# table_y <- list()
#
# n <- length(keyX) - 1
# print(length(keyY))
# if( n+1 == length(keyY)){
# for(i in (1:n)){
# range_y <- fillRange(keyX[[i]], keyX[[i+1]], keyY[i])
# paste("range = ", range_y)
# table_y[[i]] <- range_y
#
# }
# }
# return (table_y)
#
# }
#this function returns the number of bonds (higher score, higher probability to bind)between two sequences
S <- function(x,y){
align <- list()
if(nchar(x)>nchar(y)){
temp <- x
x <- y
y <- temp
}
n <- nchar(x)
m <- nchar(y)
for(i in 1:(m-1)){
y <- paste("o", y,"o", sep="")
}
xsplit <- unlist(strsplit(x, split=""))
ysplit <- unlist(strsplit(y, split=""))
i<-1
for(k in (m-1):(3*(m-1))){
tot <- 0
for(i2 in 1:n){
#print(c(xsplit[i2], ysplit[k-i2+2]))
tot <- tot + s(xsplit[i2],ysplit[k-i2+2])
#print(c("->", tot))
}
#print(c("TOT", "->", tot))
align[[i]] <- tot
i <- i +1
}
return(align)
}
Sprime <- function(x,y){
align <- list()
if(nchar(x)>nchar(y)){
temp <- x
x <- y
y <- temp
}
n <- nchar(x)
m <- nchar(y)
for(i in 1:(m-1)){
y <- paste("o", y,"o", sep="")
}
xsplit <- unlist(strsplit(x, split=""))
ysplit <- unlist(strsplit(y, split=""))
i<-1
for(k in (m-1):(3*(m-1))){
tot <- 0
for(i2 in 1:n){
#print(c(xsplit[i2], ysplit[k-i2+2]))
bond <- s(xsplit[i2],ysplit[k-i2+2])
if(bond == 0 & ysplit[k-i2 +2] != 'o'){
break
}
tot <- tot + bond
#print(c("->", tot))
}
#print(c("TOT", "->", tot))
align[[i]] <- tot
i <- i +1
}
return(align)
}
pa <- function(p, q){
return(max(unlist(S(p,q))))
}
pea <- function(p,q){
return(max(unlist(Sprime(p,q))))
}
sea <- function(seqDNA){
return(max(unlist(Sprime(stri_reverse(seqDNA), seqDNA))))
}
selfComplementarity <- function(seqDNA){
return(max(unlist(S(seqDNA, seqDNA))))
}
R SHINY
library(shiny)
ui <- fluidPage(
titlePanel("Primer design software"),
sidebarLayout(
sidebarPanel(
selectInput("selectConstraint", "Modify a constraint", choices = c("Primer length", "Melting Temp")),
#give the possibility
sliderInput("rangeX", "X", min = 0, max = 50, value = c(25, 30)),
numericInput("numY", "Y", "0"),
tableOutput("function")
),
mainPanel(
)
)
)
server <- function(input, output) {
}
shinyApp(ui = ui, server = server)
So, in this page, I would like to present this software. The chapter 0 explains how to use it, the chapter 1 and 2 how it was programmed and the chapter 3, what would be to improve next.
0 - How to use the programm
Copy-paste the whole python code in repl.it ;
Replace the string contained in the variable seqDNA (line 807) with the total sequence of DNA that is going to enter in the PCR;
Replace the string contained in the variable target (line 809) with the part of the total sequence of DNA you want to duplicate (and so, the primers must be around it)
Run the program
Wait for a dozen of minutes
There should appear a list of suggested primers, with the one at the bottom being the most recommended
I - Code of each constraint
So, each primer must respect a list of constraints. This list varies from website to website, but in general, we can find something like written below. (I mainly took my informations from this website : http://bioweb.uwlax.edu/GenWeb/Molecular/seq_anal/primer_design/primer_design.htm who himself adapted it from Innis and Gelfand,1991; I changed some value if I was seeing a lot of website using different ones).
The coding of some of those constraints is less obvious than other, so under them, I am writing more indications !
1. primers should be 17-28 bases in length;
2. The GC content should be 50-60%
3. primers should end (3') in a G or C, or CG or GC:
4. The melting temperature should be between 52 and 58 degree Celsius
They are several ways to calculate the melting temperature of a sequence of DNA, like the basic rule and the nearest-neighbour method. (For the different formulas, see Alejandro Panjkovich, Francisco Melo, Comparison of different melting temperature calculation methods for short DNA sequences, Bioinformatics, Volume 21, Issue 6, 15 March 2005, Pages 711–722, https://doi.org/10.1093/bioinformatics/bti066 at the paragraph melting temperatures calculation ; for an explanation of how the nearest-neighbour method works, see Albert Courey, Nearest-Neighbour method, https://www.youtube.com/watch?v=jy5h9NwBEgk ; ). The formula for this method differs a bit in every article I saw. This might be caused for example, by the use of different experimental data from one place of to another; some use Breslauer tables, some use SantaLucia, ... (see table 1 of this article ). Also, I think sometimes I was seeing differences from one table to an other what were probably errors of copy - so I think it's a good time to think that a good science principle is to double-check what is being said (especially double-checking what an undergraduate student like me says too !) and try to get as many points of view and references as possible !
In conclusion, this is the formula I used :
The S and H variables where found with an algorithm constructed following the tutorial I recommended above ( video ).
S = ΔS = entropy change in kcal K-1 ᐧ mol-1 (accounts for energy unable to do work, i.e. disorder)
H = ΔH = enthalpy change in kcal mol-1 (accounts for the energy change during annealing / melting)
This is the python program I wrote:
def delta_H_S(p1,p2):
"""
A function that returns the delta_H (free enthalpy in kcal/mol) and the delta_S (entropy in cal/K) of a pair of nuclotides, using the tables from Breslauer (1986)
Parameters
----------
p1 : str
a one-letter string ("A", "T", "C" or "G") to represent the first nucleotide
p2 : str
a one-letter string to represent the second nucleotide of the pair
"""
p3 = str(p1) + str(p2) #concatenate the two nucleotides
if(p3 == "AA" or p3 =="TT"):
H = 9.1
S = 24.0
elif(p3 == "AT"):
H = 8.6
S = 23.9
elif(p3 == "TA"):
H = 6.0
S = 16.9
elif(p3 == "CA" or p3 == "TG"):
H = 5.8
S = 12.9
elif(p3 == "GT" or p3 == "AC"):
H = 6.5
S = 17.3
elif(p3 == "CT" or p3 == "AG"):
H = 7.8
S = 20.8
elif(p3 == "GA" or p3 == "TC"):
H = 5.6
S = 13.5
elif(p3 == "CG"):
H = 11.9
S = 27.8
elif(p3 == "GC"):
H = 11.1
S = 26.7
elif(p3 == "GG" or p3 == "CC"):
H = 11.0
S = 26.6
#this is working for oligo
#return H, S/1000
return H, S
def sum_deltas_H_S(x):
"""
A function that returns the total enthalpy (Htot) and entropy (S) of a sequence of DNA, summing the delta_enthalpy (h) and delta_entropy (s) of each pair of nucleotides. For example, to calculate H of "AATTGCCG" we add up h of "AA", h of "AT", h of "TT", ...
Parameters
----------
x : str
the sequence of DNA from who enthalpy and entropy is calculated
"""
H, S = 0,0
for i in range(0, len(x)-1):
h, s = delta_H_S(x[i],x[i+1])
#print("h = ", h, "s = ", s)
H = H + h
S = S + s
#print("H = ", H, "S = ", S)
return H, S
I don't achieve to retrieve the file right now (slow internet connection)- but I did compare the result I had with my programm to other programms, and it was well in the average.
5. 3'-ends of primers should not be complementary (ie. base pair), as otherwise primer dimers could be form (when primer stick together instead of sticking to the sequence that must be replicated).
6. primer self-complementarity (ability to form 2o structures such as hairpins) should be avoided;
Due to low internet connections, I don't achieve to check it, but I think the link of the article from where I took my algorithms was this one
len(y)): #it's easier to code if x is always smaller than y
x,y = y,x
n,m = len(x), len(y)
for i in range(m-1):
y = "o" + y + "o" #to not provoke an error
for k in range(m-1,3*(m-1)+1):
tot = 0
#print("For km = ", k-2*(m-1), "soit k = ", k)
for i2 in range(0, n):
#print(x[i2], y[k-i2])
tot = tot + s(x[i2],y[k-i2])
#print("-> ", tot)
align.append(tot)
#print(align)
return align
def Sprime(x,y):
"""this function returns the number and strengh of bonds (higher score, higher probability to bind)between two sequences in an ininterrupted way from the start in different alignements"""
#the same code than for S except that you break if there is an interruption in the bond making
x = x.upper()
y = y.upper()
align = []
if(len(x) > len(y)):
temp = x
x = y
y = temp
n,m = len(x), len(y)
for i in range(m-1):
y = "o" + y + "o"
for k in range(m-1,3*(m-1)+1):
tot = 0
print("For km = ", k-2*(m-1), "soit k = ", k)
for i2 in range(0, n):
bond = s(x[i2],y[k-i2])
print(x[i2], y[k-i2])
print("k-i2 = ", k-i2)
if(bond == 0 and y[k-i2] != 'o'): #the first bond between the two sequences has to score more than 0
break
tot = tot + bond
print("-> ", tot)
align.append(tot)
#print(align)
return align
def pa(p,q):
"""so pair annealing"""
return max(S(p,q))
def pea(p,q):
"""so pair-end annealing"""
return max(Sprime(p,q))
#why is that working like that, who knows
def sea(seqDNA):
"""self-end annealing"""
return max(Sprime(seqDNA[::-1],seqDNA))
def selfComplementarity(seqDNA):
"""self annealing"""
return max(S(seqDNA, seqDNA))
On R:
s <- function(a,b){
if((a == "A" & b == "T") | (a == "T" & b == "A")){
return (2)
}
else if((a == "C" & b == "G") | (a == "G" & b == "C")){
return (4)
}
else{
return (0)
}
}
#this function returns the number of bonds (higher score, higher probability to bind)between two sequences
S <- function(x,y){
align <- list()
if(nchar(x)>nchar(y)){
temp <- x
x <- y
y <- temp
}
n <- nchar(x)
m <- nchar(y)
for(i in 1:(m-1)){
y <- paste("o", y,"o", sep="")
}
xsplit <- unlist(strsplit(x, split=""))
ysplit <- unlist(strsplit(y, split=""))
i<-1
for(k in (m-1):(3*(m-1))){
tot <- 0
for(i2 in 1:n){
#print(c(xsplit[i2], ysplit[k-i2+2]))
tot <- tot + s(xsplit[i2],ysplit[k-i2+2])
#print(c("->", tot))
}
#print(c("TOT", "->", tot))
align[[i]] <- tot
i <- i +1
}
return(align)
}
Sprime <- function(x,y){
align <- list()
if(nchar(x)>nchar(y)){
temp <- x
x <- y
y <- temp
}
n <- nchar(x)
m <- nchar(y)
for(i in 1:(m-1)){
y <- paste("o", y,"o", sep="")
}
xsplit <- unlist(strsplit(x, split=""))
ysplit <- unlist(strsplit(y, split=""))
i<-1
for(k in (m-1):(3*(m-1))){
tot <- 0
for(i2 in 1:n){
#print(c(xsplit[i2], ysplit[k-i2+2]))
bond <- s(xsplit[i2],ysplit[k-i2+2])
if(bond == 0 & ysplit[k-i2 +2] != 'o'){
break
}
tot <- tot + bond
#print(c("->", tot))
}
#print(c("TOT", "->", tot))
align[[i]] <- tot
i <- i +1
}
return(align)
}
pa <- function(p, q){
return(max(unlist(S(p,q))))
}
pea <- function(p,q){
return(max(unlist(Sprime(p,q))))
}
sea <- function(seqDNA){
return(max(unlist(Sprime(stri_reverse(seqDNA), seqDNA))))
}
selfComplementarity <- function(seqDNA){
return(max(unlist(S(seqDNA, seqDNA))))
}
7. runs of three or more Cs or Gs at the 3'-ends of primers may promote mispriming at G or C-rich sequences should be avoided.
II - Select the best primer respecting all the constraints
There is the case where it’s impossible to find a primer that respects perfectly all those constraints. In this case, we have to choose a primer that respects the most important, because it’s still possible that the experiment works, if for example, the primer is one nucleotide too long according to some website.
So I decided to use a score system, with the primer with the lowest score being the best.
For example, let’s say that the primer is 10 bp length, which is bad so, he would get a +10 score; if the primer is 15 bp length, it’s still bad, but less, so he would get a +5 score; and if it’s 20 length, then it’s good, so the primer would get +0 points. For problems that are too bad, put a +1000 should eliminate the primer for sure
To choose what points to assign to what constraints, I saw two methods :
1 - using tables with key points :
I was thinking of the system of graph editor used in 3d animation software, for example like in Blender : https://docs.blender.org/manual/en/latest/editors/graph_editor/fcurves/introduction.html. You choose a few key points and the computer calculate the images in between those images. You can also choose how the computer calculates those in between (interpolation mode), like linearly, exponentially, constant.
2 - using functions (
A function F that to a value X attributes a value Y with X being the characteristic of the actual sequence of DNA and Y the penalty it returns.
For example, for the number of repetitions in a sequence :
(the following part is taken from an e-mail my Professor sent me)
pMax = maximal penalty
N = number of repetitions in sequence (e.g., N=4 for AAAA or GGGG and for AGAGAGAG, GAGAGAGA)
n = function parameter
K= function parameter
p = pMax * N^n / (K^n + N^n)
With reasonables values being:
pMax = 100
K = 3 (penalty is 50 for 3 repeats)
n = 4
(end of the part my Professor sent me).
Here is a comparison of the two methods:
because of time of uploads and the fact I have 8 picture in this section, I do not upload this section on the website
Using the key point method, this is the table I wrote. I choose the score points from an average of what is the worst in PCR that I got after reading several articles - but a biologist that have actually experience in the wet lab could probably provide a better point system.
TABLE 1 : CONSTRAINT BASED ON THE SINGLE PRIMER ITSELF (FORWARD OR REVERSE)
IV - What stays to improve:
The most important thing to do would be to check that this software give the same results than other primer design software or manual selection !
Maybe resources as this could help :
Kumar A., Chordia N. (2015) In Silico PCR Primer Designing and Validation. In: Basu C. (eds) PCR Primer Design. Methods in Molecular Biology, vol 1275. Humana Press, New York, NY
Then, it would be great to improve the documentation, double-check everything I wrote and find an academic scientific proof for the values and formulas I found on the internet ! (I wrote this article on the deadline day on the 21 October 2019 with pieces of what I wrote and what I remembered from August 2019; so although I think this page could be of use for future developers, mistakes are possible !).
Create a more user friendly environment. In R, the pist I have would be to use Shiny. In Python, there is Tkinter.
Solve the time problems (see Big O notation). Given the time my software takes to give an answer, I probably wrote a horrible algorithm of brutal force ! Here is the part that is probably taking the most time :
def findPrimer(seq, direc = "F"):
"""A function that takes in entry the whole sequence of DNA and the direction of the primers we are looking for (by default "F" for forward) and returns a list of the possibles primers"""
allP = [] #contains all the primers (of all lengths)
bpMin = 15 #mininum base length
bpMax = 35 # maximum base length
if bpMax > len(seq.seqDNA): #we don't want to have this absurdity of a primer bigger than a sequence - so in case the sequence provided by the user is very short, we have to go down with the length of the primer
bpMin = 0
bpMax = round(len(seq.seqDNA)/3)
for bp in range(bpMin, bpMax):
#we test for the primers of all lengths possibles
allbpP = [] #contains all the primers of length bp
if direc == "F":
for i in range(seq.start, seq.target_start - bp, 1): #the first forward primer will have his start on the start of the whole DNA sequence - and the last primer will have his end on the start of the target sequence that we don't want to bind on
p = Primer(seq.seqDNA[i:i+bp])
allbpP.append(p)
elif direc == "R":
for i in range(seq.end, seq.target_end + bp, -1):
p = Primer(seq.seqDNA[i-bp:i])
allbpP.append(p)
allP.append(allbpP)
print("allbpP = ", allbpP)
return allbpP
def findPair(allF, allR, seq):
"""
A function that takes in entry all the possible forwards (allF) and reverse (allR) primers in the sequence (seq) and returns the list of all the possible pairs of primers sorted by penalties score
"""
#allF, allR = sorted(allF1), sorted(allR1)
#took the following code from here : https://stackoverflow.com/questions/403421/how-to-sort-a-list-of-objects-based-on-an-attribute-of-the-objects/48731059
try: import operator
except ImportError: keyfun= lambda x: x.score # use a lambda if no operator module
else: keyfun= operator.attrgetter("score") # use operator since it's faster than lambda
allF.sort(key=keyfun, reverse=True) # sort in-place
allR.sort(key=keyfun, reverse=True) # sort in-place
allPair = []
for f in allF:
for r in allR:
pair = pairPrimer(f, r, seq)
allPair.append(pair)
try: import operator
except ImportError: keyfun= lambda x: x.score # use a lambda if no operator module
else: keyfun= operator.attrgetter("score") # use operator since it's faster than lambda
allPair.sort(key=keyfun, reverse=True) # sort in-place
A way to make it faster would be probably to cut the program once we have found the 20 first best primers
Conclusion
Well, I hope this can help someone ! It was for me a wonderful opportunity to work with the iGEM team this summer. I learned a great amount, above all about biology and coding (I am a math student). If you need anything else concerning this page, please contact the uOttawa iGEM team, I stay available to help !