# Convert codon-usage summary files into a single JSON array.
#
# Every input file is read as consecutive pairs of lines:
#   line 1 (header): "<id>:<species name>:<CDS count>"
#   line 2 (counts): whitespace-separated integer counts, one per codon,
#                    paired positionally with the entries of `triplets`.
# Each record whose CDS count is at least 10 becomes one JSON object in
# the output file, with its codon counts grouped by amino acid.
import json
import os
import sys

# Input files, looked up inside the source folder.
# NOTE(review): 'gbbct' is the only entry without a '.spsum' suffix; it is
# kept exactly as found -- confirm it matches the file name on disk.
src = ['gbbct', 'gbpln.spsum', 'gbvrt.spsum', 'gbvrl.spsum', 'gbrod.spsum',
       'gbpri.spsum', 'gbphg.spsum', 'gbmam.spsum', 'gbinv.spsum']

# Lookup table of amino acids and their codons. The order is significant:
# flattening it yields `triplets`, the order in which the counts of a
# record are matched to codons.
_CODONS_BY_AMINO_ACID = (
    ('Arg', ('CGA', 'CGC', 'CGG', 'CGT', 'AGA', 'AGG')),
    ('Leu', ('CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG')),
    ('Ser', ('TCA', 'TCC', 'TCG', 'TCT', 'AGC', 'AGT')),
    ('Thr', ('ACA', 'ACC', 'ACG', 'ACT')),
    ('Pro', ('CCA', 'CCC', 'CCG', 'CCT')),
    ('Ala', ('GCA', 'GCC', 'GCG', 'GCT')),
    ('Gly', ('GGA', 'GGC', 'GGG', 'GGT')),
    ('Val', ('GTA', 'GTC', 'GTG', 'GTT')),
    ('Lys', ('AAA', 'AAG')),
    ('Asn', ('AAC', 'AAT')),
    ('Gln', ('CAA', 'CAG')),
    ('His', ('CAC', 'CAT')),
    ('Glu', ('GAA', 'GAG')),
    ('Asp', ('GAC', 'GAT')),
    ('Tyr', ('TAC', 'TAT')),
    ('Cys', ('TGC', 'TGT')),
    ('Phe', ('TTC', 'TTT')),
    ('Ile', ('ATA', 'ATC', 'ATT')),
    ('Met', ('ATG',)),
    ('Trp', ('TGG',)),
    ('STOP', ('TAA', 'TAG', 'TGA')),
)

# All 64 codons in the positional order used by the counts line.
triplets = [codon for _, codons in _CODONS_BY_AMINO_ACID for codon in codons]

# Records with fewer coding sequences than this are left out of the output.
_MIN_CDS = 10


def _read_line_pairs(path):
    """Return the (header, counts) line pairs of the file at *path*.

    Lines are paired by position (1st with 2nd, 3rd with 4th, ...). A
    trailing line without a partner -- typically the empty string left
    after the final newline -- is dropped.
    """
    with open(path) as handle:
        lines = handle.read().split('\n')
    return list(zip(lines[0::2], lines[1::2]))


def _parse_header(header):
    """Split a header line into ``(record_id, species_name, cds_count)``.

    The id ends at the first colon and the CDS count starts after the last
    one; everything in between is the species name, so colons inside the
    name are kept verbatim however many there are.

    Raises:
        ValueError: if the line has fewer than two colons or the CDS count
            is not an integer.
    """
    record_id, first_colon, rest = header.partition(':')
    name, last_colon, cds_text = rest.rpartition(':')
    if not first_colon or not last_colon:
        raise ValueError('header is not of the form "id:name:CDS"')
    return record_id, name, int(cds_text)


def _group_counts(counts_line):
    """Return ``(counts grouped by amino acid, total count)`` for one record.

    Raises:
        ValueError: if the line does not hold exactly one integer per codon.
    """
    values = counts_line.split()
    if len(values) != len(triplets):
        raise ValueError('expected {} codon counts, found {}'.format(
            len(triplets), len(values)))
    numbers = [int(value) for value in values]
    by_codon = dict(zip(triplets, numbers))
    grouped = {
        amino_acid: {codon: by_codon[codon] for codon in codons}
        for amino_acid, codons in _CODONS_BY_AMINO_ACID
    }
    return grouped, sum(numbers)


def _build_record(species_id, name, cds_count, counts_line):
    """Assemble the JSON-serialisable dict for one species record."""
    amino_acids, total = _group_counts(counts_line)
    return {
        'speciesID': species_id,
        # Double quotes were always rewritten to single quotes in the
        # emitted name; kept so existing consumers see the same names.
        'speciesName': name.replace('"', "'"),
        'CDS': cds_count,
        'nbCodons': total,
        'AminoAcids': amino_acids,
    }


def main(source_dir='workingFolder', filenames=None, output_path='data.txt'):
    """Read every source file and write the combined JSON array.

    Args:
        source_dir: folder containing the input files.
        filenames: input file names inside *source_dir*; defaults to `src`.
        output_path: file that receives the JSON array, one object per
            line, objects separated by a comma and a newline.

    Returns:
        The number of records written.

    Raises:
        OSError: if an input file cannot be read or the output file cannot
            be written.

    Malformed records are reported on stderr and skipped instead of
    aborting the run. Species IDs are assigned consecutively, across all
    files, to the records that are actually written.
    """
    if filenames is None:
        filenames = src

    serialised = []
    for filename in filenames:
        path = os.path.join(source_dir, filename)
        for header, counts_line in _read_line_pairs(path):
            if not header.strip() and not counts_line.strip():
                continue  # blank padding, not a record
            try:
                _record_id, name, cds_count = _parse_header(header)
                if cds_count < _MIN_CDS:
                    continue
                record = _build_record(
                    len(serialised), name, cds_count, counts_line)
            except ValueError as error:
                sys.stderr.write('{}: skipping record {!r}: {}\n'.format(
                    path, header, error))
                continue
            # json handles escaping, so backslashes or control characters
            # in a name can no longer produce an invalid document.
            serialised.append(json.dumps(
                record, separators=(',', ':'), ensure_ascii=False))

    with open(output_path, 'w') as output:
        # An empty result is written as "[]", which is still valid JSON.
        output.write('[' + ',\n'.join(serialised) + ']')
    return len(serialised)


if __name__ == '__main__':
    main()