一个简单的C语言词法分析与语法分析器【原】

sabolasi

浏览: 907070 次

最近访客更多访客>>

u012363178

jklwan

wangyy

wwssmm00220033

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (1295)

社区版块

存档分类

2011-12 ( 21)
2011-11 ( 35)
2011-10 ( 34)
更多存档...

词法分析
可识别内容:
标识符：id
数字：num
关键字：int,char,if,else,while,do,for
标号：, ， . ， ;
算术运算符号：=，+，-，*，/，&，!，|，&&，||
关系运算符：<，<=，>=，>，==，!=
注释：//

内码定义：
单个符号，如{，+，*，> 等，均使用其ascii码做内码，占双或多个字节的符号（包括保留字，标号，数字，运算符等）为其取名如下：
Enum { END=0,INT,CHAR,IF,ELSE,WHILE=5,
DO,FOR,ARGAND,ARGOR,NUM=10,
ID,LESSEQUAL,EQUAL,GREATEQUAL,NOTEQUAL=15 };
其中NUM代表数字，ID代表标识符.

测试程序1-1的词法分析结果如下：

内码表

123{
11X
61=
1012
43+
43+
11b
47/
1013
45-
105
42*
104
42*
109
59;
11Y
61=

104
42*
101024
59;
3if
40(
11X
14>=
11Y
41)
123{
3if40(
11i
13==
11i

41)
123{
11X
61=
11Y
125}
125}
59;
59;
59;
59;
5while
40(
11X
60<

11Y
41)
123{
11X
61=
11X
43+
101
59;
125}
125}

语法分析
C语言子集，可支持
语句块，语句，条件语句，While循环语句，赋值语句，基本算术表达式等。例如：
{
// Comment Supported : This is only a Test ^_^

X = 12 + b / 13 - 5 * 4 * 9;// A AssignmentStatement
Y = 4 * 1024;
if( X >= Y){
if( i == i){// This is nested if Statement
X=Y;
}
}
;;;;// This is Null Statement
while( X < Y){// This is while Statement
X = X +1;
}
}
测试程序1-1

支持错误检测，如将上面例子中X = 12 + b / 13 - 5 * 4 * 9;
故意修改为：X = 12 ++ b / 13 - 5 * 4 * 9; 则会出现如下错误提示，指示了出错行数和行内偏移位置：

规则如下：
<StatementBlock> ::= '{'<StatementSequence>'}'
<StatementSequence> ::= {<NullStatement>|<CommonStatement>|<VariantStatement>}

<AssignmentStatement> ::= ID=<Expression>
<ConditionStatement> ::= if(<Condition> <StatementBlock>
<LoopWhileStatement> ::= while(<Condition> <StatementBlock>

<Condition> ::= <Expression><RelationOperator><Expression>
<Expression> ::= <Item>{+<Item>|-<Item>}
<Item> ::= <Factor>{*<Factor>|/<Factor>}
<Factor> ::= ID|NUM|(<Expression>)
<RelationOperator> ::= <|<=|>=|>|==|!=

//非终结符的英文定义
void StatementBlock();//语句块
void StatementSequence();//语句串

// XxxxxStatement() 为三类语句
void NullStatement();//空语句--仅仅含有一个;号
void CommonStatement();//语句
void VariantStatement();//变种语句--包括 if(){},while{},他们都不以;结尾

// 下面的属于CommonStatement
void AssignmentStatement();//赋值语句

// 下面两种属于VariantStatement
void ConditionStatement();//条件语句
void LoopWhileStatement();//while循环语句

void Condition();//条件
void Expression();//表达式
void Item();//项
void Factor();//因子
void RelationOperator();//关系运算符

不能支持的主要方面：函数调用的识别，逗号表达式，for循环，switch语句。

词法分析：

// LexAly.cpp:C子集词法分析程序

支持内容:

标识符:id

关键字:int,char,if,else,while,do,for

标号:,,.,;

算术运算符号:=,+,-,&,!,|,&&,||

全局字符串:

instr记录待解析的字符串

string存贮当前被解析到的ID

接口:

gettoken();

Sample:

输入:

instr=

for(i=0;i<10;i++){

j=i+10;

printf("%d",j);

}

输出:

for

(

……

}

注意:

要记得处理程序中未显示出来的符号，如空白('')，回车(' ')，值表符(' ')

#include " stdafx.h "

#include < ctype.h >

#include < stdlib.h >

#include < string.h >

#include " Constant.h "

extern void grammar_check();

// 注意：这里累计数目(最大值)不能大于32(100000B)TOKEN

// enum{END=0,INT=1,CHAR,IF,ELSE,WHILE,DO,FOR,ARGAND,ARGOR,NUM,ID};

char index[][ 20 ] = {

{ " ENDOFFile " } , /* 0END */

{ " int " } , /* 1INT */

{ " char " } , /* 2CHAR */

{ " if " } , /* 3IF */

{ " else " } , /* 4ELSE */

{ " while " } , /* 5WHILE */

{ " do " } , /* 6DO */

{ " for " } , /* 7FOR */

{ " && " } , /* 8ARGAND */

{ " || " } , /* 9ARGOR */

{ "" } , /* 10NUM */

{ "" } , /* 11ID */

{ " <= " } , /* 12LESSEQUAL */

{ " == " } , /* 13EQUAL */

{ " >= " } , /* 14GREATEQUAL */

{ " != " } , /* 15NOTEQUAL */

{ "" } /* 16ID */

} ;

char input[ 10000 ] = { 0 } ;

char * instr = input;

char * const start_of_instr = input;

// string包含gettoken最新得到的id等串

// gym包含gettoken得到的内容的代号

// current_line包含当前行号

char string[MAX_INDENT];

int sym;

int current_line = 1 ;

int start_pos_of_current_line;

char * strstart; // 用于辅助识别num,id

int gettoken();

int _gettoken();

void error( char * cur);

char * getlinestring( int line, char * in_buf);

int nextline();

int getline();

int getcurrentpos();

int nextline() { return ++ current_line;}

int getline() { return current_line;}

int getcurrentpos() { return ( int )instr;}

char * getlinestring( int line, char * in_buf)

{

char * t = input;

int i = 1 ;

while ( * t != 0 && i < line) {

if ( * t == ' ' )i ++ ;

t ++ ;

}

int len = 0 ;

while ( * t != ' ' ) {

in_buf[len] = * t;

len ++ ;

t ++ ;

}

in_buf[len] = 0 ;

return in_buf;

}

void error( char * cur)

{

printf( " SpellErrorfoundatline%d " ,getline());

exit( 0 );

}

// 语法分析

int main_grammar( char * filename)

{

int i;

FILE * f;

if ( ! (f = fopen(filename, " r " ))) {

printf( " Failtoopensourcefile%s! " ,filename);

exit( 0 );

}

int k = 0 ;

char c;

while ((c = fgetc(f)) != EOF)

{

input[k] = c;

k ++ ;

}

input[k] = 0 ;

// 打印出程序

printf( " %s " ,start_of_instr);

// 开始语法检查

grammar_check();

printf( " Success! " );

return 0 ;

}

// 词法分析

int main_spell( char * filename)

{

int i;

FILE * f;

if ( ! (f = fopen(filename, " r " ))) {

printf( " Failtoopensourcefile%s! " ,filename);

exit( 0 );

}

int k = 0 ;

char c;

while ((c = fgetc(f)) != EOF)

{

input[k] = c;

k ++ ;

}

input[k] = 0 ;

printf( " %s " ,start_of_instr);

while ((i = gettoken()) != END)

{

if (i == ID) {

printf( " %d %s " ,i,string);

continue ;

}

if (i == NUM) {

printf( " %d %s " ,i,string);

continue ;

}

if (i < 20 ) {

printf( " %d %s " ,i,index[i]);

} else {

printf( " %d %c " ,i,i);

}

return 0 ;

}

int gettoken()

{

int i = (sym = _gettoken());

# if 0

if (i == ID) {

printf( " %s " ,string);

}

if (i == NUM) {

printf( " %s " ,string);

}

if (i < 20 ) {

printf( " %s " ,index[i]);

} else {

printf( " %c " ,i);

}

#endif

return sym;

}

int _gettoken()

{

char * cp = instr;

for (;;) {

if ( * instr == 0 )

return END;

if(可能读入的字符>当前可用缓冲区大小)

扩展缓冲区

// int,char,if,else,while,do,for

switch ( * instr)

{

case ' i ' :

if (instr[ 1 ] == ' f ' && notda(instr[ 2 ]))

{

instr += 2 ; return IF;

}

if (instr[ 1 ] == ' n ' && instr[ 2 ] == ' t ' && notda(instr[ 3 ]))

{

instr += 3 ; return INT;

}

// notakeyword.butanid.

strstart = instr;

instr ++ ;

goto id_label;

case ' c ' :

if (instr[ 1 ] == ' h ' && instr[ 2 ] == ' a ' && instr[ 3 ] == ' r ' && notda(instr[ 4 ]))

{instr += 4 ; return CHAR;}

strstart = instr;

instr ++ ;

goto id_label;

break ;

case ' e ' :

if (instr[ 1 ] == ' l ' && instr[ 2 ] == ' s ' && instr[ 3 ] == ' e ' && notda(instr[ 4 ]))

{instr += 4 ; return ELSE;}

strstart = instr;

instr ++ ;

goto id_label;

break ;

case ' w ' :

if (instr[ 1 ] == ' h ' && instr[ 2 ] == ' i ' && instr[ 3 ] == ' l ' && instr[ 4 ] == ' e ' && notda(instr[ 5 ]))

{instr += 5 ; return WHILE;}

strstart = instr;

instr ++ ;

goto id_label;

case ' d ' :

if (instr[ 1 ] == ' o ' && notda(instr[ 4 ]))

{instr += 2 ; return DO;}

strstart = instr;

instr ++ ;

goto id_label;

case ' f ' :

if (instr[ 1 ] == ' o ' && instr[ 2 ] == ' r ' && notda(instr[ 3 ]))

{instr += 3 ; return FOR;}

strstart = instr;

instr ++ ;

goto id_label;

// dealwithIDs.

// EXCLUDE:i,c,d,e,w,f

case ' a ' :; case ' b ' :;

case ' g ' :; case ' h ' :;

case ' j ' :; case ' k ' :; case ' l ' :;

case ' m ' :; case ' n ' :; case ' o ' :;

case ' p ' :; case ' q ' :; case ' r ' :;

case ' s ' :; case ' t ' :; case ' u ' :;

case ' v ' :; case ' x ' :;

case ' y ' :; case ' z ' :;

case ' A ' :; case ' B ' :;

case ' C ' :; case ' D ' :; case ' E ' :;

case ' F ' :; case ' G ' :; case ' H ' :;

case ' I ' :; case ' J ' :; case ' K ' :;

case ' L ' :; case ' M ' :; case ' N ' :;

case ' O ' :; case ' P ' :; case ' Q ' :;

case ' R ' :; case ' S ' :; case ' T ' :;

case ' U ' :; case ' V ' :; case ' W ' :;

case ' X ' :; case ' Y ' :; case ' Z ' :;

strstart = instr;

instr ++ ;

goto id_label;

case ' 0 ' :;

case ' 1 ' :; case ' 2 ' :; case ' 3 ' :;

case ' 4 ' :; case ' 5 ' :; case ' 6 ' :;

case ' 7 ' :; case ' 8 ' :; case ' 9 ' :;

strstart = instr;

instr ++ ;

goto num_label;

case ' { ' :

instr ++ ;

return ' { ' ;

case ' } ' :

instr ++ ;

return ' } ' ;

case ' ( ' :

instr ++ ;

return ' ( ' ;

case ' ) ' :

instr ++ ;

return ' ) ' ;

case ' + ' :

instr ++ ;

return ' + ' ;

case ' - ' :

instr ++ ;

return ' - ' ;

case ' * ' :

instr ++ ;

return ' * ' ;

case ' / ' :

if (instr[ 1 ] == ' / ' ) { // ‘ // ’形式的注释

instr += 2 ;

while ( * (instr) != 10 && * (instr) != 0 )

instr ++ ;

// instr++;

} else { // 除号'/'

instr ++ ;

return ' / ' ;

}

break ;

case ' = ' :

if (instr[ 1 ] == ' = ' ) {instr += 2 ; return EQUAL;}

else {instr ++ ; return ' = ' ;}

break ;

case ' < ' :

if (instr[ 1 ] == ' = ' ) {instr += 2 ; return LESSEQUAL;}

else {instr ++ ; return ' < ' ;}

break ;

case ' > ' :

if (instr[ 1 ] == ' = ' ) {instr += 2 ; return GREATEQUAL;}

else {instr ++ ; return ' > ' ;}

break ;

case ' ! ' :

if (instr[ 1 ] == ' = ' ) {instr += 2 ; return NOTEQUAL;}

else {instr ++ ; return ' ! ' ;}

break ;

case ' & ' :

if (instr[ 1 ] == ' & ' ) {instr += 2 ; return ARGAND;}

if (instr[ 1 ] == ' & ' && (isid(instr[ 2 ]) || isspace(instr[ 2 ]))) {instr ++ ; return ' & ' ;}

error(instr);

break ;

case ' | ' :

if (instr[ 1 ] == ' | ' ) {instr += 2 ; return ARGAND;}

if (instr[ 1 ] == ' | ' && (isid(instr[ 2 ]) || isspace(instr[ 2 ]))) {instr ++ ; return ' | ' ;}

error(instr);

break ;

case ' ; ' :

instr ++ ;

return ' ; ' ;

case ' ' :

// printf("newline(%d) ",getline());

nextline();

instr ++ ;

start_pos_of_current_line = ( int )instr;

break ;

default :

instr ++ ;

break ;

id_label:

while (isid( * instr))

instr ++ ;

strncpy(string,strstart,instr - strstart);

string[instr - strstart] = 0 ;

return ID;

num_label:

while (isdigit( * instr))

instr ++ ;

// if(isalpha(*(instr+1))

// error(instr);让语法分析来做吧~

strncpy(string,strstart,instr - strstart);

string[instr - strstart] = 0 ;

return NUM;

}

int main( int argc, char * argv[])

{

if (argc <= 1 || argc >= 4 ) {

printf( " Usage:>LexAly[g|s][filename] " );

exit( 0 );

}

if (argc == 3 ) {

argv[ 1 ][ 0 ] == ' g ' ? main_grammar(argv[ 2 ]):main_spell(argv[ 2 ]);

} else if (argc == 2 ) {

argv[ 1 ][ 0 ] == ' g ' ? main_grammar( " source2.txt " ):main_spell( " source2.txt " );

;

}

return 0 ;

}

// grammar.cpp:C子集语法分析程序

C语言子集，可支持

语句块，语句，条件语句，While循环语句，赋值语句，基本算术表达式等。例如：

{

//CommentSupported:ThisisonlyaTest^_^

X=12+b/13-5*4*9;//AAssignmentStatement

Y=4*1024;

if(X>=Y){

if(i==i){//ThisisnestedifStatement

X=Y;

}

;;;;//ThisisNullStatement

while(X<Y){//ThisiswhileStatement

X=X+1;

}

规则如下：

<StatementBlock>::='{'<StatementSequence>'}'

<StatementSequence>::={<NullStatement>|<CommonStatement>|<VariantStatement>}

<NullStatement>::=';'

<LoopWhileStatement>::=while(<Condition><StatementBlock>

<Expression>::=<Item>{+<Item>|-<Item>}

<Item>::=<Factor>{*<Factor>|/<Factor>}

<Factor>::=ID|NUM|(<Expression>)

<RelationOperator>::=<|<=|>=|>|==|!=

#include " stdafx.h "

#include < ctype.h >

#include < conio.h >

#include < stdlib.h >

#include < string.h >

#include " Constant.h "

extern int gettoken();

extern int getcurrentpos();

extern char * string;

extern int sym;

extern int current_line;

extern int start_pos_of_current_line;

extern char * getlinestring( int line, char * in_buf);

extern char input[];

// 非终结符的英文定义

void StatementBlock(); // 语句块

void StatementSequence(); // 语句串

// XxxxxStatement()为三类语句

void NullStatement(); // 空语句--仅仅含有一个;号

void CommonStatement(); // 语句

void VariantStatement(); // 变种语句--包括if(){},while{},他们都不以;结尾

// 下面的属于CommonStatement

void AssignmentStatement(); // 赋值语句

// 下面两种属于VariantStatement

void ConditionStatement(); // 条件语句

void LoopWhileStatement(); // while循环语句

void Condition(); // 条件

void Expression(); // 表达式

void Item(); // 项

void Factor(); // 因子

void RelationOperator(); // 关系运算符

注：以上未考虑函数调用表达式

void match_error( char * c)

{

char error_buf[ 1024 ];

int in_line_pos = getcurrentpos() - start_pos_of_current_line;

printf( " GrammarError! " );

printf( " Line%d[%d]:%sexpected " ,current_line,in_line_pos,c);

// 获取错误行并打印出来

getlinestring(current_line,error_buf);

printf( " %s " ,error_buf);

// 输出错误指示点(NotExact!)

for ( int i = 1 ;i < in_line_pos;i ++ )

printf( " %c " , ' ^ ' );

printf( " " );

exit( 0 );

}

// expecetedSym期望符号

// msg出错时给出的消息

void match( int expecetedSym, char * msg)

{

if (expecetedSym != sym) {

# if 0

if (sym < 0x20 )

printf( " Fail:ExpecetedSym=%dsym=%d " ,expecetedSym,sym);

else

printf( " Fail:ExpecetedSym=%dsym=%c " ,expecetedSym,sym);

#endif

match_error(msg);

}

gettoken(); // 预读一个符号

}

void grammar_check()

{

// printf("%s",input);

gettoken(); // 开始检查，填充预读区

// match('-',"DK");

StatementBlock();

if (sym != END)match(END, " EndOfFile " );

}

void StatementBlock() // 语句块

{

match( ' { ' , " { " ); // 和预读符号比较

StatementSequence();

match( ' } ' , " } " );

}

void StatementSequence() // 语句串

{

while (sym == ID ||

sym == IF ||

sym == WHILE ||

sym == ' ; ' )

{

while (sym == ID) // 也可以用if(),但从统计角度看,while效率会略高。

// 因为一般普通CommonStatement()出现概率较大

{

CommonStatement();

match( ' ; ' , " ; " );

}

while (sym == IF ||

sym == WHILE)

{

VariantStatement();

}

while (sym == ' ; ' )

{

NullStatement();

}

void VariantStatement() // 变种语句--包括if(){},while{},他们都不以;结尾

{

switch (sym) { // 若sym与下面两种均不匹配，什么也不做，空语句是也~

case IF:

ConditionStatement(); // 条件语句

break ;

case WHILE:

LoopWhileStatement();

break ;

}

return ;

}

void NullStatement() // 空语句--仅仅含有一个;号

{

match( ' ; ' , " ; " );

}

void CommonStatement() // 语句,以;结尾,但不以;开头

{

switch (sym) { // 若sym与下面任何一种均不匹配，什么也不做，空语句是也~

case ID:

AssignmentStatement();

break ;

}

return ;

}

void AssignmentStatement() // 赋值语句

{

match(ID, " ID " );

match( ' = ' , " = " );

Expression();

}

void ConditionStatement() // 条件语句

{

match(IF, " if " );

match( ' ( ' , " ( " );

Condition();

match( ' ) ' , " ) " );

StatementBlock();

}

void LoopWhileStatement() // 循环语句

{

match(WHILE, " while " );

match( ' ( ' , " ( " );

Condition();

match( ' ) ' , " ) " );

StatementBlock();

}

void Condition() // 条件

{

Expression();

RelationOperator();

Expression();

}

// <Expression>::=<Item>{+<Item>|-<Item>}

void Expression() // 表达式

{

Item();

while ( true ) { // {+<Item>|-<Item>}可以有多个

switch (sym) {

case ' + ' :

match( ' + ' , " + " );

Item();

break ;

case ' - ' :

match( ' - ' , " - " );

Item();

break ;

default :

return ;

}

return ;

}

// <Item>::=<Factor>{*<Factor>|/<Factor>}

void Item() // 项

{

Factor();

while ( true ) { // {*<Factor>|/<Factor>}--可以有多个

switch (sym) {

case ' * ' :

match( ' * ' , " * " );

Factor();

break ;

case ' / ' :

match( ' / ' , " / " );

Factor();

break ;

default :

return ;

}

return ;

}

// <Factor>::=ID|NUM|(<Expression>)

void Factor() // 因子

{

switch (sym) {

case ID:

match(ID, " ID " );

break ;

case NUM:

match(NUM, " NUM " );

break ;

case ' ( ' :

match( ' ( ' , " ( " );

Expression();

match( ' ) ' , " ) " );

break ;

default :

match(ID, " Afactor " ); // ID在这里肯定不match，利用它来报错（找不到因子）

break ;

}

// <RelationOperator>::=<|<=|>=|>|==|!=

void RelationOperator() // 关系运算符

{

switch (sym) {

case ' < ' :

match( ' < ' , " < " );

break ;

case ' > ' :

match( ' > ' , " > " );

break ;

case LESSEQUAL:

match(LESSEQUAL, " <= " );

break ;

case GREATEQUAL:

match(GREATEQUAL, " >= " );

break ;

case EQUAL:

match(EQUAL, " == " );

break ;

case NOTEQUAL:

match(NOTEQUAL, " != " );

break ;

}

核心：规则----

<StatementBlock> ::= '{'<StatementSequence>'}'
<StatementSequence> ::= {<NullStatement>|<CommonStatement>|<VariantStatement>}

<AssignmentStatement> ::= ID=<Expression>
<ConditionStatement> ::= if(<Condition> <StatementBlock>
<LoopWhileStatement> ::= while(<Condition> <StatementBlock>

分享到：