Question
I need help on how to read a string sequence(genetic sequence) frame by frame in C++. I have found all the start codons(ATG) and stop
I need help reading a string sequence (a genetic sequence) frame by frame in C++. I have found all the start codons (ATG) and stop codons (TAG, TGA, TAA), and their positions are sorted in ascending order. The next step is to find a start codon (ATG) whose reading frame ends with one of the stop codons (TAG, TGA, TAA) and use substr to extract the potential gene. I have started some code for the frames, but I am not sure how to continue. The frames basically follow this pattern:
// Entry point (as posted): reads a file name from standard input, loads the
// sequence text from that file, computes C/G statistics, collects start- and
// stop-codon positions, and sketches three reading-frame loops.
// NOTE(review): the page extraction dropped everything from each '<' onward on
// several lines (the bare `cout` statements, the `vector` declarations, and the
// `while (2 + pos` conditions), so this listing does not compile as shown.
int main()
{
string fileName;
cout
cin >> fileName;
// Open the named file for reading; exit with status 1 if the open failed.
fstream fileStream(fileName, ios::in);
if (fileStream.fail())
{
cout
exit(1);
}
cout
// Consume the first non-whitespace character and then the rest of the first
// line. Presumably this skips the '>' header line of the input file - confirm
// against the actual input.
char dummyChar = '>';
fileStream >> dummyChar;
string dummyString;
getline(fileStream, dummyString);
cout
cout
string genome,fasta;
int length;
float cTotal, gTotal, gcTotal, gcPercent;
// Concatenate every remaining line into fasta. getline drops the line
// terminator, so fasta ends up as one contiguous string.
// NOTE(review): `while (getline(fileStream, genome))` is the usual form;
// testing eof() before reading can attempt one read past the last line.
while (!fileStream.eof())
{
getline(fileStream,genome); // Saves the line in genome.
fasta += genome;//appending each line to string fasta
}
length = fasta.length();
cTotal = countGenes(fasta, 'C'); // Acquires total amount of "C" genes
gTotal = countGenes(fasta, 'G'); // Acquires total amount of "G" genes
gcTotal = (gTotal + cTotal); // Total amount of both "C" and "G" genes
gcPercent = ((gcTotal / fasta.size()) * 100); // Total percentage of both "C" and "G" genes out of all genes
cout
// NOTE(review): the element type was lost in extraction; presumably
// vector<int> - confirm against the original source.
vector startCodonPositions;
// Record the index of every "ATG" occurrence in fasta.
startCodon(fasta, startCodonPositions, "ATG");
cout
// NOTE(review): truncated line; presumably the declaration of
// stopCodonPositions - confirm against the original source.
vector
// startCodon takes the codon to search for as its third argument, so the same
// helper is reused here to append the positions of all three stop codons to
// one vector.
startCodon(fasta, stopCodonPositions, "TAG");
startCodon(fasta, stopCodonPositions, "TGA");
startCodon(fasta, stopCodonPositions, "TAA");
cout
// Sort the stop-codon positions in ascending order (the three searches above
// append their results one after another, so the combined list is unsorted).
sort(stopCodonPositions.begin(), stopCodonPositions.end(), sortAcend);
// printing the stop codons in ascending order
// for (int i : stopCodonPositions)
// {
// cout
// }
//string currentGene = fasta.substr(startCodonPositions + 3, stopCodonPositions - (startCodonPositions + 3));
// Frame 1: scanning starts at offset 0.
int pos = 0;
// NOTE(review): loop condition truncated in extraction.
while (2 + pos
{
// NOTE(review): the comma operator discards its first two operands, so this
// condition only tests whether startCodonPositions[2] is nonzero; it does not
// inspect the characters of fasta at pos.
if(startCodonPositions[0], startCodonPositions[1], startCodonPositions[2])
{
pos += 3;
}
// Runs on every iteration, so pos advances by 4 when the branch above is
// taken and by 1 otherwise.
++pos;
}
// Frame 2: scanning starts at offset 1.
pos = 1;
// NOTE(review): loop condition truncated in extraction.
while ( 2 + pos
{
// Same comma-operator condition as in frame 1: only startCodonPositions[2]
// is tested.
if(startCodonPositions[0], startCodonPositions[1], startCodonPositions[2])
{
//is ATG ...
pos += 3;
}
++pos;
}
// Frame 3: scanning starts at offset 2.
pos = 2;
// NOTE(review): loop condition truncated in extraction.
while ( 2 + pos
{
// Same comma-operator condition as in frame 1: only startCodonPositions[2]
// is tested.
if(startCodonPositions[0], startCodonPositions[1], startCodonPositions[2])
{
//is ATG ...
pos += 3;
}
++pos;
}
fileStream.close();
//NC_003988_1.fna
return 0;
}
My gene.cpp
/**
 * Counts how many times a character occurs in a sequence string.
 *
 * The earlier version scaled the count to a hard-coded length of 7374 (a
 * "repeat the string up to N characters" calculation), which returned a wrong
 * total for any sequence that was not exactly 7374 characters long and divided
 * by zero for an empty string. The count is now taken directly from the actual
 * contents of str.
 *
 * @param str  Sequence to scan; may be empty.
 * @param gene Character to count (for example 'C' or 'G').
 * @return Number of positions in str whose character equals gene.
 */
int countGenes(std::string str, char gene)
{
    int count = 0;
    for (const char base : str)
    {
        if (base == gene)
            ++count;
    }
    return count;
}
/**
 * Appends to vec the index of every occurrence of codon in str.
 *
 * Despite its name, the function searches for whatever codon is passed in, so
 * it serves for start and stop codons alike. The search resumes one character
 * after each hit, so overlapping occurrences are all reported. Existing
 * contents of vec are kept; new indices are appended in ascending order.
 *
 * @param str   Sequence to search.
 * @param vec   Output vector that receives the 0-based match positions.
 * @param codon Substring to look for (for example "ATG").
 */
void startCodon(std::string str, std::vector<int>& vec, std::string codon)
{
    std::string::size_type start = str.find(codon);
    while (start != std::string::npos)
    {
        // Positions are stored as int to match the int-based comparator used
        // when the caller sorts them.
        vec.push_back(static_cast<int>(start));
        start = str.find(codon, start + 1);
    }
}
/**
 * Comparator for std::sort that orders integers ascending.
 *
 * @param i Left-hand value.
 * @param j Right-hand value.
 * @return true when i must come before j, i.e. when i is strictly less than j.
 */
bool sortAcend(int i, int j)
{
    return i < j;
}
Part of the input file (FASTA format, e.g. NC_003988_1.fna):
>NC_003988.1 Simian enterovirus A, complete genome GAGTGTTCCCACCCAACAGGCCCACTGGGTGTTGTACTCTGGTATTACGGTACCTTTGTACGCCTATTTT ATTTCCCCCCCCTTTTTGAAACTTAGAAGTTAATAATAAACACGCTCACTAGGTGCACTACATCCAGTAG TGTAATGAGCAAGCACTTCTGTCTYCCCCGGGAGGGATATATGGTACGCTGTGCAAACGGCGGAAATTAA TCCTACCGTTAACCGCCCACCTACTCCGAGAAGCCTAGTACCTAATTGGATTTATCAATGGAGTTGCGCT CAGCAGGTGACCCTGACCTGCCAGCTCCGGCTGATGGACCTGGGCTTTCCCCACAGGCGACTGTGGCCCA GGTCGCGTGGCGGCCGGCCCACCCCCCTGGGTGGGACGCCTTGATAATGACAAGGTGGGAAGAGCCTATT GGGCTAGCTGGTTTCCTCCGGCCTCCTGAATGCGGCTAACCTTAACCCCAGAGCATATGGTAGCAACCCA GCTACTAGTATGTCATAATGCGTAAGTCTGGGATGGGACCGACTACTTTGGAGAGTCCGTGTTTCTATTG TTTCTTTAATCAATCTTATGGTGACAATTTATAGTGCCCTGAGTATTGATTGGTTGTTGCTTTTGACAAT TATTGAGACATCACATAGACATAATGGGAGCTCAAGTAAGCAGGCAAACGTCTGGTGCGCATGACACCCG GATACGGGCTGAACAGGGCGCAAACATTCATTATACTAATATCAATTATTATAGAGATGCAGCTAGCAAT GCAGCAAGCAAAATGGACTATTCCCAGGATCCGGACAAGTTCACGAAACCAGTACTTGATGCTATAACTG AACCATTACCCACGCTGAAGTCCCCTAGTGCTGAGGCATGTGGGTACAGCGACCGAGTTGCACAACTGAC AATTGGCAATTCCACTATCACTACTCAGGAAGCCGCCAATGTGGTGGTCGCATATGGACAATGGCCTGAA TATTTAGATTCGAAGGATGCAACTGCCGTGGATAAGCCCACACAGCCCGATGTAGCCTCAAATAGATTTT ACACTCTTAAGACAGTGTCTTGGGAGAAGAGTTCAACTGGCTGGTATTGGAAATTCTCGGATTGTCTGGC TTCTGTTGGATTATTTGGACAGAATGTACAGTATCATTATTTAGGCCGTTATGGGTTAGCGGTTCATGTG CAATGTAATGCTTCAAAATTTCATCAGGGCACTCTACTGGTCTTAGCAATACCAGAATGGGAGATTGGGG TGTCTAATGCTGATAGGGCATCCTTTAATCTAACAAACCCCGATAAGAACGGGCATACTATGACTGGTCA AGAAGCTTATTGCTTACATAATGGGACTAACATCCATTCTTCACTGGTATTTCCACATCAATTCATCAAT CTTAGGACAAACAATTGTGCTACGTTAGTCTTGCCCTATGTGGGAGCAACACCACTGGACACACCGATCA AGCATAATGTTTGGTCATTGGTAGTAATACCGGTGGTCCCGTTGGATTACACCACTGGTGCAACTACACA AGTGCCTATAACAATAACAATGGCTCCAATGGCGTGCGAGTTTAACGGACTGCGCAATGCCATCACCCAA GGGCTGCCAGTACTCAATACACCCGGCTCTGGGCAGTTTGTGACTACAGATAATTTCCAATCACCAAACT TGATTCCAAATTTTGATGTGACACAAGTCTTTAATAGTCCAGGTGAAATTATTAATTTACAGCAGTATGT CCAGATTGAGGGCATTATGGAAATCAATAATGTAGCAAGTGCAAATAATTTGGAGAGAATTCGCATTCCA ATATCAGTCCAGAGTGGAATTGATGAGATGTTATTTGCAATCAACTGCAACCCAGGAACAGCCCAGGAGT 
TTAGACGCACACCCCTGGGAGATGTGTGTAGGTATTATACACAGTGGTCAGGTAGCATACAAATTACATT TACATTTTGTGGTTCATTTATGACAACAGGAAAATTATTAATTTGCTACACCCCTCCGGGTGGTCGAGTA CCACAAAATAGAGAGGAGGCAATGCTAGGGACTAATGTGATCTGGGATTTTGGTTTACAATCCAGCGTTA CGCTGAACATACCGTGGATAAGTGGAGCCCATTTTAGAAACACTTCTGTTAATGTCGATGGTTTTGATAA CACAGGGTATGTATCTGCTTGGTTTCAAACGAACATGGTAGTTCCTCCCGATGCTCCAACGACTGCTTAT ATATTGGCTTTTACATCAGCCAAGGATGATTTCTCGATGCGCTTGTTGCGGGATACAGCAGAGATTTCGC AAGACGGATTTCTGCAAGGACCAATAGATCAAGCAATAGAAAAAGTAATCACTGATGTAGTGTCTGACAC GCGTGAGTCTAGTAGTGACTTTAGCATTGGGGCTGTTCCAGCATTGAATGCGGTGGAAACTGGAGCCACT TCGCAAGCTAGTGTTGAGTCCACCATTGAGACGCGGGCCGTGCAGAATCGTCATCGCACTTCTGAGATGA GCGTGGAAAGCTTTTTGGGCCGCTCTAGTTTAGTAACTCGCTTTACCATTAATAATGGAGGAACAAATAA TGCCACGAAGTTTCGTAACTGGAAAATAAACTTAAAGGAAGTGGTGCAGCTGCGGCGTAAATTAGAAATG TTTACTTACGTGCGCTTTGATCTTGAGGTGACTATAGTGGCTGTGAATTTGACTGGAAATGGAGGAGTGC GTTACATGTACCAAGCAATGTACTGCCCCCCAGGTGCCCCCCTCCCCACCAATGCTGATCAATATCTGTG GCAATCCTCGACAAATCCCTCCATAATCGGAGCAGTTGGTGAAGTCCCAGGCAGAGTATCAGTGCCTTTT GTGTCAAATGCTAATATGTATGCCACCTTTTATGATGGATATCCATCCTTTGGAAGCATAAATGGACAGG GAAATGGCTCTGATTACGGTGCATTCATACCAAATGATATGGGTACATTGTGTTTCCGATTACTCAATAT CTTTAATAATGGTCCACAAATTCAATTTAGAGTGTTCATGAAACCCAAGCATGTACGAGTATGGTGCCCA
Thank you.
Reading frames of the example sequence GTATGACAATATGAAA (transcribed from the figure):
Frame 1: GTA TGA CAA TAT GAA A - TGA is a STOP codon.
Frame 2: G TAT GAC AAT ATG AAA - contains ATG.
Frame 3: GT ATG ACA ATA TGA AA - ATG (Met) followed in frame by TGA (STOP). The start codon's position ensures that this frame is chosen.
Step by Step Solution
There are 3 Steps involved in it
Step: 1
Get Instant Access with AI-Powered Solutions
See step-by-step solutions with expert insights and AI powered tools for academic success
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance
Get Started