Answered step by step

Verified Expert Solution

Link Copied!

Question

1 Approved Answer

Posted on Jul 26, 2024

I need help completing the lexer.c file. There are a few files included in the main lexer.c file which are included below. All of the

I need help completing the lexer.c file. There are a few files included in the main lexer.c file which are included below. All of the files can also be accessed at the github link below.

https://github.com/anon7257/Compiler

This is part of a larger project to build a compiler. All of the code still needed is indicated with a TODO comment and a string of *s. The images are the instructions for the project.

image text in transcribed

****************************************************************************

lexer.c

****************************************************************************

#include

#include "token.h"

#include "lexer.h"

#include "lexer_reserved.c"

struct Token** lexer(FILE *lexerin) {

int token_list_i = 0;

struct Token** token_list = alloc_token_list();

char c;

c = fgetc(lexerin);

while (true) {

char buffer[MAX_LEXEME_LEN];

int buffer_i = 0;

if (token_list_i > MAX_TOKENS) {

error(0, 0, "exceeded maximum tokens");

exit(1);

}

if (isalpha(c) || '_' == c) {

// TODO: fill in the identifier/reserved word lexer ********************

// TODO: buffer an identifier (don't forget the \0 sentinel!) ********************

// hash function to match reserved words is given

struct reserved_type *reserved = lexer_reserved(buffer, strlen(buffer));

if (NULL != reserved) {

// reserved word

token_list[token_list_i++] = new_token(reserved->type);

} else {

// TODO: create token for identifier with its lexeme ********************

}

} else if (isdigit(c)) {

while(isdigit(c) && buffer_i

buffer[buffer_i++] = c;

c = fgetc(lexerin);

}

buffer[bugger_i++] = '\0';

token_list[token_list_i++] = new_number(NUMBER,ATOI(buffer));

} else if (ispunct(c)) {

// TODO: fill in the rest of the punctuation-based lexemes ********************

if ('#' == c) { // given

while (' ' != (c = fgetc(lexerin)));

} else if ('=' == c) { // given

token_list[token_list_i++] = new_token(EQ);

c = fgetc(lexerin);

} else if (',' == c) { // given

token_list[token_list_i++] = new_token(COMMA);

c = fgetc(lexerin);

// TODO: lots more else ifs here ********************

} else {

error(0, 0, "invalid token");

exit(1);

}

} else if (isspace(c)) {

c = fgetc(lexerin);

} else if (c == EOF) {

token_list[token_list_i++] = new_token(EOT);

break;

}

return token_list;

}

****************************************************************************

token.h

****************************************************************************

#ifndef TOKEN_H

#define TOKEN_H

#include

#define MAX_LEXEME_LEN 512

#define MAX_TOKENS 4096

typedef enum tok_kind Type;

enum tok_kind {

VAR = 1,

FUNC,

INT,

BOOL,

RETURN,

BEGIN,

END,

IF,

THEN,

ELSE,

WHILE,

DO,

READ,

WRITE,

TRUE,

FALSE,

OR,

AND,

NOT,

EQ,

COMMA,

COLON,

ASSIGN,

PLUS,

MINUS,

MULT,

DIV,

MOD,

LPAREN,

RPAREN,

LT,

LTE,

GT,

GTE,

NEQ,

IDENT,

NUMBER,

EOT,

};

struct Token {

enum tok_kind kind;

union {

char *identifier; // only when kind == IDENT

int number; // only when kind == NUMBER

};

struct Token *alloc_token();

struct Token** alloc_token_list();

struct Token* new_identifier(int kind, char *lexeme);

struct Token* new_number(int kind, int number);

struct Token* new_token(int kind);

struct Token *alloc_token();

struct Token** alloc_token_list();

void write_tokens(FILE *, struct Token**);

void print_token_kind(FILE *, enum tok_kind);

void print_token(FILE *, struct Token*);

struct Token** read_tokens(FILE *);

#endif

****************************************************************************

lexer.h

****************************************************************************

#ifndef LEXER_H

#define LEXER_H

#include

struct Token** lexer(FILE *lexerin);

#endif

****************************************************************************

lexer_reserved.c

****************************************************************************

/* ANSI-C code produced by gperf version 3.1 */

/* Command-line: gperf -N lexer_reserved -t lexer_reserved.gperf */

/* Computed positions: -k'1-2' */

#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \

&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \

&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \

&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \

&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \

&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \

&& ('9' == 57) && (':' == 58) && (';' == 59) && ('

&& ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \

&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \

&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \

&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \

&& ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \

&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \

&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \

&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \

&& ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \

&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \

&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \

&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \

&& ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \

&& ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \

&& ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \

&& ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))

/* The character set is not based on ISO-646. */

#error "gperf generated tables don't work with this execution character set. Please report a bug to ."

#endif

#line 1 "lexer_reserved.gperf"

struct reserved_type { char *name; int type; };

#define TOTAL_KEYWORDS 19

#define MIN_WORD_LENGTH 2

#define MAX_WORD_LENGTH 8

#define MIN_HASH_VALUE 2

#define MAX_HASH_VALUE 28

/* maximum key range = 27, duplicates = 0 */

#ifdef __GNUC__

__inline

#else

#ifdef __cplusplus

inline

#endif

static unsigned int

hash (register const char *str, register size_t len)

{

static unsigned char asso_values[] =

{

29, 29, 29, 29, 29, 29, 29, 29, 29, 29,

29, 29, 29, 29, 29, 29, 29, 15, 15, 29,

5, 5, 0, 29, 10, 10, 29, 29, 15, 29,

0, 0, 29, 29, 0, 29, 0, 20, 5, 0,

29, 29, 29, 29, 29, 29, 29, 29, 29, 29,

29, 29, 29, 29, 29, 29

};

return len + asso_values[(unsigned char)str[1]] + asso_values[(unsigned char)str[0]];

}

struct reserved_type *

lexer_reserved (register const char *str, register size_t len)

{

static struct reserved_type wordlist[] =

{

{""}, {""},

#line 19 "lexer_reserved.gperf"

{"or", OR},

#line 21 "lexer_reserved.gperf"

{"not", NOT},

#line 17 "lexer_reserved.gperf"

{"true", TRUE},

#line 16 "lexer_reserved.gperf"

{"write", WRITE},

{""},

#line 14 "lexer_reserved.gperf"

{"do", DO},

#line 9 "lexer_reserved.gperf"

{"end", END},

#line 15 "lexer_reserved.gperf"

{"read", READ},

{""},

#line 7 "lexer_reserved.gperf"

{"return", RETURN},

#line 10 "lexer_reserved.gperf"

{"if", IF},

#line 5 "lexer_reserved.gperf"

{"int", INT},

#line 11 "lexer_reserved.gperf"

{"then", THEN},

#line 13 "lexer_reserved.gperf"

{"while", WHILE},

{""}, {""},

#line 20 "lexer_reserved.gperf"

{"and", AND},

#line 6 "lexer_reserved.gperf"

{"bool", BOOL},

#line 18 "lexer_reserved.gperf"

{"false", FALSE},

{""}, {""},

#line 3 "lexer_reserved.gperf"

{"var", VAR},

#line 12 "lexer_reserved.gperf"

{"else", ELSE},

#line 8 "lexer_reserved.gperf"

{"begin", BEGIN},

{""}, {""},

#line 4 "lexer_reserved.gperf"

{"function", FUNC}

};

if (len = MIN_WORD_LENGTH)

{

if (key

{

if (*str == *s && !strcmp (str + 1, s + 1))

return &wordlist[key];

}

return 0;

}

Grammar Specification The following is the grammar specification for a variant of PL/O, specified in EBNF notation The lexical specification defines the tokens of the language and their respective lexemes. The EBNF notation for each token is equivalent to a regular expression The grammar defines the syntax of the language. Tokens are represented in all caps, e.g., IDENT. The parser takes as input a stream of tokens and produces an AST according to the grammar, with tokens as the leaves of the tree Grammar // EBNF notation tips Lowercase symbols are nonterminals UPPERCASE symbols are terminals (tokens from your lexer) = denotes a production means 0 or more 1 means 0 or 1 just groups symbols // declarations program block vardecls funcdelcs ::= { FUNC IDENT LPAREN [ formals ] RPAREN [ COLON type ] block } formals type : := block ::vardecls funcdecls statement :VAR IDENT COLON type h ::- IDENT COLON type { COMMA IDENT COLON type } :INT BOOL // statements statement :IDENT ASSIGN expr | IDENT LPAREN exprlist RPAREN RETURN expr | BEGIN statement END | IF expr THEN statement ELSE statement 1 | WHILE expr DO statement I READ IDENT I WRITE expr exprlist : expr COMMA expr > 1 // expressions expr simpleexpr term [ termop term ] term factor :simpleexpr relop simpleexpr ] ::factor factorop factor] ::= IDENT [ LPAREN exprlist RPAREN ] 1 NUMBER 1 TRUE I FALSE I LPAREN expr RPAREN I NOT facto // operations at each precedence of the expression relop termop factorop :MULT I DIV I MOD I AND ::- PLUS I MINUS | 0R Lexical Specification // reserved words VAR FUNC ::"function" INT BOOL "bool" RETURN ::- "return" BEGIN ::- "begin" END IF THEN ::- ELSE ::- "else" WHILE ;-"while" DO READ WRITE :"-"write" TRUE FALSE "false" OR AND NOT ::- "var" ::"int" "end" :"if" "then" "do" :-"read" ::- "true" "or "and" "not" // punctuation EQ COMMA ::-"," COLON ::- ":" ASSIGN :-"-" PLUS MINUS ::- "-" MULT DIV MOD LPAREN ::= "(" LT LTE GT GTE NEQ // tokens with lexemes IDENT ;:-( letter i . .. ) { ( letter I digit l "-" ) } NUMBER ::= digit { digit } // language definitions letter ::= [A-Za-z] digit[0-91 for lexemes Comments Comments are indicated by a hash # symbol and extend from the symbol to the end of the line. Comments are implemented for you already Lexer Use the follow C standard functions to identify the character class of a single character. isalpha - is a letter isalnum - is a letter or number isdigit - is a number ispunct - is a punctuation character, i.e., not a letter or digit but still printable Use the following idioms to build up the list of tokens. new_token and new_identifier are helper methods defined in token.c that you may use to construct the correct Token struct. token_list [token_list_i++]new_token(EQ) token_listltoken_list_i+-new_identifier (IDENT, buffer): Use atoi to create the value for a NUMBER token. token_listltoken_list_i++new_number (NUMBER, atoi(buffer)); Use the following idiom to save a lexeme during lexing. buffer[buffer_i++-c cfgetc(lexerin); Debugging If your lexer is hanging, make sure the input is being advanced with fgetc Grammar Specification The following is the grammar specification for a variant of PL/O, specified in EBNF notation The lexical specification defines the tokens of the language and their respective lexemes. The EBNF notation for each token is equivalent to a regular expression The grammar defines the syntax of the language. Tokens are represented in all caps, e.g., IDENT. The parser takes as input a stream of tokens and produces an AST according to the grammar, with tokens as the leaves of the tree Grammar // EBNF notation tips Lowercase symbols are nonterminals UPPERCASE symbols are terminals (tokens from your lexer) = denotes a production means 0 or more 1 means 0 or 1 just groups symbols // declarations program block vardecls funcdelcs ::= { FUNC IDENT LPAREN [ formals ] RPAREN [ COLON type ] block } formals type : := block ::vardecls funcdecls statement :VAR IDENT COLON type h ::- IDENT COLON type { COMMA IDENT COLON type } :INT BOOL // statements statement :IDENT ASSIGN expr | IDENT LPAREN exprlist RPAREN RETURN expr | BEGIN statement END | IF expr THEN statement ELSE statement 1 | WHILE expr DO statement I READ IDENT I WRITE expr exprlist : expr COMMA expr > 1 // expressions expr simpleexpr term [ termop term ] term factor :simpleexpr relop simpleexpr ] ::factor factorop factor] ::= IDENT [ LPAREN exprlist RPAREN ] 1 NUMBER 1 TRUE I FALSE I LPAREN expr RPAREN I NOT facto // operations at each precedence of the expression relop termop factorop :MULT I DIV I MOD I AND ::- PLUS I MINUS | 0R Lexical Specification // reserved words VAR FUNC ::"function" INT BOOL "bool" RETURN ::- "return" BEGIN ::- "begin" END IF THEN ::- ELSE ::- "else" WHILE ;-"while" DO READ WRITE :"-"write" TRUE FALSE "false" OR AND NOT ::- "var" ::"int" "end" :"if" "then" "do" :-"read" ::- "true" "or "and" "not" // punctuation EQ COMMA ::-"," COLON ::- ":" ASSIGN :-"-" PLUS MINUS ::- "-" MULT DIV MOD LPAREN ::= "(" LT LTE GT GTE NEQ // tokens with lexemes IDENT ;:-( letter i . .. ) { ( letter I digit l "-" ) } NUMBER ::= digit { digit } // language definitions letter ::= [A-Za-z] digit[0-91 for lexemes Comments Comments are indicated by a hash # symbol and extend from the symbol to the end of the line. Comments are implemented for you already Lexer Use the follow C standard functions to identify the character class of a single character. isalpha - is a letter isalnum - is a letter or number isdigit - is a number ispunct - is a punctuation character, i.e., not a letter or digit but still printable Use the following idioms to build up the list of tokens. new_token and new_identifier are helper methods defined in token.c that you may use to construct the correct Token struct. token_list [token_list_i++]new_token(EQ) token_listltoken_list_i+-new_identifier (IDENT, buffer): Use atoi to create the value for a NUMBER token. token_listltoken_list_i++new_number (NUMBER, atoi(buffer)); Use the following idiom to save a lexeme during lexing. buffer[buffer_i++-c cfgetc(lexerin); Debugging If your lexer is hanging, make sure the input is being advanced with fgetc