Question
I need help completing the lexer.c file. There are a few files included in the main lexer.c file which are included below. All of the
I need help completing the lexer.c file. There are a few files included in the main lexer.c file which are included below. All of the files can also be accessed at the github link below.
https://github.com/anon7257/Compiler
This is part of a larger project to build a compiler. All of the code still needed is indicated with a TODO comment and a string of *s. The images are the instructions for the project.
****************************************************************************
lexer.c
****************************************************************************
#include
#include
#include
#include
#include
#include
#include "token.h"
#include "lexer.h"
#include "lexer_reserved.c"
struct Token** lexer(FILE *lexerin) {
int token_list_i = 0;
struct Token** token_list = alloc_token_list();
char c;
c = fgetc(lexerin);
while (true) {
char buffer[MAX_LEXEME_LEN];
int buffer_i = 0;
if (token_list_i > MAX_TOKENS) {
error(0, 0, "exceeded maximum tokens");
exit(1);
}
if (isalpha(c) || '_' == c) {
// TODO: fill in the identifier/reserved word lexer ********************
// TODO: buffer an identifier (don't forget the \0 sentinel!) ********************
// hash function to match reserved words is given
struct reserved_type *reserved = lexer_reserved(buffer, strlen(buffer));
if (NULL != reserved) {
// reserved word
token_list[token_list_i++] = new_token(reserved->type);
} else {
// TODO: create token for identifier with its lexeme ********************
}
} else if (isdigit(c)) {
while(isdigit(c) && buffer_i
buffer[buffer_i++] = c;
c = fgetc(lexerin);
}
buffer[bugger_i++] = '\0';
token_list[token_list_i++] = new_number(NUMBER,ATOI(buffer));
} else if (ispunct(c)) {
// TODO: fill in the rest of the punctuation-based lexemes ********************
if ('#' == c) { // given
while (' ' != (c = fgetc(lexerin)));
} else if ('=' == c) { // given
token_list[token_list_i++] = new_token(EQ);
c = fgetc(lexerin);
} else if (',' == c) { // given
token_list[token_list_i++] = new_token(COMMA);
c = fgetc(lexerin);
// TODO: lots more else ifs here ********************
} else {
error(0, 0, "invalid token");
exit(1);
}
} else if (isspace(c)) {
c = fgetc(lexerin);
} else if (c == EOF) {
token_list[token_list_i++] = new_token(EOT);
break;
}
}
return token_list;
}
****************************************************************************
token.h
****************************************************************************
#ifndef TOKEN_H
#define TOKEN_H
#include
#define MAX_LEXEME_LEN 512
#define MAX_TOKENS 4096
typedef enum tok_kind Type;
enum tok_kind {
VAR = 1,
FUNC,
INT,
BOOL,
RETURN,
BEGIN,
END,
IF,
THEN,
ELSE,
WHILE,
DO,
READ,
WRITE,
TRUE,
FALSE,
OR,
AND,
NOT,
EQ,
COMMA,
COLON,
ASSIGN,
PLUS,
MINUS,
MULT,
DIV,
MOD,
LPAREN,
RPAREN,
LT,
LTE,
GT,
GTE,
NEQ,
IDENT,
NUMBER,
EOT,
};
struct Token {
enum tok_kind kind;
union {
char *identifier; // only when kind == IDENT
int number; // only when kind == NUMBER
};
};
struct Token *alloc_token();
struct Token** alloc_token_list();
struct Token* new_identifier(int kind, char *lexeme);
struct Token* new_number(int kind, int number);
struct Token* new_token(int kind);
struct Token *alloc_token();
struct Token** alloc_token_list();
void write_tokens(FILE *, struct Token**);
void print_token_kind(FILE *, enum tok_kind);
void print_token(FILE *, struct Token*);
struct Token** read_tokens(FILE *);
#endif
****************************************************************************
lexer.h
****************************************************************************
#ifndef LEXER_H
#define LEXER_H
#include
struct Token** lexer(FILE *lexerin);
#endif
****************************************************************************
lexer_reserved.c
****************************************************************************
/* ANSI-C code produced by gperf version 3.1 */
/* Command-line: gperf -N lexer_reserved -t lexer_reserved.gperf */
/* Computed positions: -k'1-2' */
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
&& ('9' == 57) && (':' == 58) && (';' == 59) && ('
&& ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
&& ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
&& ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
&& ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
&& ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
&& ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
&& ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set. Please report a bug to ."
#endif
#line 1 "lexer_reserved.gperf"
struct reserved_type { char *name; int type; };
#define TOTAL_KEYWORDS 19
#define MIN_WORD_LENGTH 2
#define MAX_WORD_LENGTH 8
#define MIN_HASH_VALUE 2
#define MAX_HASH_VALUE 28
/* maximum key range = 27, duplicates = 0 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
hash (register const char *str, register size_t len)
{
static unsigned char asso_values[] =
{
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 15, 15, 29,
5, 5, 0, 29, 10, 10, 29, 29, 15, 29,
0, 0, 29, 29, 0, 29, 0, 20, 5, 0,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29
};
return len + asso_values[(unsigned char)str[1]] + asso_values[(unsigned char)str[0]];
}
struct reserved_type *
lexer_reserved (register const char *str, register size_t len)
{
static struct reserved_type wordlist[] =
{
{""}, {""},
#line 19 "lexer_reserved.gperf"
{"or", OR},
#line 21 "lexer_reserved.gperf"
{"not", NOT},
#line 17 "lexer_reserved.gperf"
{"true", TRUE},
#line 16 "lexer_reserved.gperf"
{"write", WRITE},
{""},
#line 14 "lexer_reserved.gperf"
{"do", DO},
#line 9 "lexer_reserved.gperf"
{"end", END},
#line 15 "lexer_reserved.gperf"
{"read", READ},
{""},
#line 7 "lexer_reserved.gperf"
{"return", RETURN},
#line 10 "lexer_reserved.gperf"
{"if", IF},
#line 5 "lexer_reserved.gperf"
{"int", INT},
#line 11 "lexer_reserved.gperf"
{"then", THEN},
#line 13 "lexer_reserved.gperf"
{"while", WHILE},
{""}, {""},
#line 20 "lexer_reserved.gperf"
{"and", AND},
#line 6 "lexer_reserved.gperf"
{"bool", BOOL},
#line 18 "lexer_reserved.gperf"
{"false", FALSE},
{""}, {""},
#line 3 "lexer_reserved.gperf"
{"var", VAR},
#line 12 "lexer_reserved.gperf"
{"else", ELSE},
#line 8 "lexer_reserved.gperf"
{"begin", BEGIN},
{""}, {""},
#line 4 "lexer_reserved.gperf"
{"function", FUNC}
};
if (len = MIN_WORD_LENGTH)
{
register unsigned int key = hash (str, len);
if (key
{
register const char *s = wordlist[key].name;
if (*str == *s && !strcmp (str + 1, s + 1))
return &wordlist[key];
}
}
return 0;
}
Grammar Specification The following is the grammar specification for a variant of PL/O, specified in EBNF notation The lexical specification defines the tokens of the language and their respective lexemes. The EBNF notation for each token is equivalent to a regular expression The grammar defines the syntax of the language. Tokens are represented in all caps, e.g., IDENT. The parser takes as input a stream of tokens and produces an AST according to the grammar, with tokens as the leaves of the tree Grammar // EBNF notation tips Lowercase symbols are nonterminals UPPERCASE symbols are terminals (tokens from your lexer) = denotes a production means 0 or more 1 means 0 or 1 just groups symbols // declarations program block vardecls funcdelcs ::= { FUNC IDENT LPAREN [ formals ] RPAREN [ COLON type ] block } formals type : := block ::vardecls funcdecls statement :VAR IDENT COLON type h ::- IDENT COLON type { COMMA IDENT COLON type } :INT BOOL // statements statement :IDENT ASSIGN expr | IDENT LPAREN exprlist RPAREN RETURN expr | BEGIN statement END | IF expr THEN statement ELSE statement 1 | WHILE expr DO statement I READ IDENT I WRITE expr exprlist : expr COMMA expr > 1 // expressions expr simpleexpr term [ termop term ] term factor :simpleexpr relop simpleexpr ] ::factor factorop factor] ::= IDENT [ LPAREN exprlist RPAREN ] 1 NUMBER 1 TRUE I FALSE I LPAREN expr RPAREN I NOT facto // operations at each precedence of the expression relop termop factorop :MULT I DIV I MOD I AND ::- PLUS I MINUS | 0R Lexical Specification // reserved words VAR FUNC ::"function" INT BOOL "bool" RETURN ::- "return" BEGIN ::- "begin" END IF THEN ::- ELSE ::- "else" WHILE ;-"while" DO READ WRITE :"-"write" TRUE FALSE "false" OR AND NOT ::- "var" ::"int" "end" :"if" "then" "do" :-"read" ::- "true" "or "and" "not" // punctuation EQ COMMA ::-"," COLON ::- ":" ASSIGN :-"-" PLUS MINUS ::- "-" MULT DIV MOD LPAREN ::= "(" LT LTE GT GTE NEQ // tokens with lexemes IDENT ;:-( letter i . .. ) { ( letter I digit l "-" ) } NUMBER ::= digit { digit } // language definitions letter ::= [A-Za-z] digit[0-91 for lexemes Comments Comments are indicated by a hash # symbol and extend from the symbol to the end of the line. Comments are implemented for you already Lexer Use the follow C standard functions to identify the character class of a single character. isalpha - is a letter isalnum - is a letter or number isdigit - is a number ispunct - is a punctuation character, i.e., not a letter or digit but still printable Use the following idioms to build up the list of tokens. new_token and new_identifier are helper methods defined in token.c that you may use to construct the correct Token struct. token_list [token_list_i++]new_token(EQ) token_listltoken_list_i+-new_identifier (IDENT, buffer): Use atoi to create the value for a NUMBER token. token_listltoken_list_i++new_number (NUMBER, atoi(buffer)); Use the following idiom to save a lexeme during lexing. buffer[buffer_i++-c cfgetc(lexerin); Debugging If your lexer is hanging, make sure the input is being advanced with fgetc Grammar Specification The following is the grammar specification for a variant of PL/O, specified in EBNF notation The lexical specification defines the tokens of the language and their respective lexemes. The EBNF notation for each token is equivalent to a regular expression The grammar defines the syntax of the language. Tokens are represented in all caps, e.g., IDENT. The parser takes as input a stream of tokens and produces an AST according to the grammar, with tokens as the leaves of the tree Grammar // EBNF notation tips Lowercase symbols are nonterminals UPPERCASE symbols are terminals (tokens from your lexer) = denotes a production means 0 or more 1 means 0 or 1 just groups symbols // declarations program block vardecls funcdelcs ::= { FUNC IDENT LPAREN [ formals ] RPAREN [ COLON type ] block } formals type : := block ::vardecls funcdecls statement :VAR IDENT COLON type h ::- IDENT COLON type { COMMA IDENT COLON type } :INT BOOL // statements statement :IDENT ASSIGN expr | IDENT LPAREN exprlist RPAREN RETURN expr | BEGIN statement END | IF expr THEN statement ELSE statement 1 | WHILE expr DO statement I READ IDENT I WRITE expr exprlist : expr COMMA expr > 1 // expressions expr simpleexpr term [ termop term ] term factor :simpleexpr relop simpleexpr ] ::factor factorop factor] ::= IDENT [ LPAREN exprlist RPAREN ] 1 NUMBER 1 TRUE I FALSE I LPAREN expr RPAREN I NOT facto // operations at each precedence of the expression relop termop factorop :MULT I DIV I MOD I AND ::- PLUS I MINUS | 0R Lexical Specification // reserved words VAR FUNC ::"function" INT BOOL "bool" RETURN ::- "return" BEGIN ::- "begin" END IF THEN ::- ELSE ::- "else" WHILE ;-"while" DO READ WRITE :"-"write" TRUE FALSE "false" OR AND NOT ::- "var" ::"int" "end" :"if" "then" "do" :-"read" ::- "true" "or "and" "not" // punctuation EQ COMMA ::-"," COLON ::- ":" ASSIGN :-"-" PLUS MINUS ::- "-" MULT DIV MOD LPAREN ::= "(" LT LTE GT GTE NEQ // tokens with lexemes IDENT ;:-( letter i . .. ) { ( letter I digit l "-" ) } NUMBER ::= digit { digit } // language definitions letter ::= [A-Za-z] digit[0-91 for lexemes Comments Comments are indicated by a hash # symbol and extend from the symbol to the end of the line. Comments are implemented for you already Lexer Use the follow C standard functions to identify the character class of a single character. isalpha - is a letter isalnum - is a letter or number isdigit - is a number ispunct - is a punctuation character, i.e., not a letter or digit but still printable Use the following idioms to build up the list of tokens. new_token and new_identifier are helper methods defined in token.c that you may use to construct the correct Token struct. token_list [token_list_i++]new_token(EQ) token_listltoken_list_i+-new_identifier (IDENT, buffer): Use atoi to create the value for a NUMBER token. token_listltoken_list_i++new_number (NUMBER, atoi(buffer)); Use the following idiom to save a lexeme during lexing. buffer[buffer_i++-c cfgetc(lexerin); Debugging If your lexer is hanging, make sure the input is being advanced with fgetcStep by Step Solution
There are 3 Steps involved in it
Step: 1
Get Instant Access to Expert-Tailored Solutions
See step-by-step solutions with expert insights and AI powered tools for academic success
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance
Get Started