编译原理(一).词法分析

本文介绍了正则表达式

github.com/riscv/riscv-gcc

正则表达式

正则表达式是语法

正则语言是语义

正则表达式定义了一个正则语言，为集合

形如L(正则表达式)

eg.C语言标识符的正则表达式

1	^[a-zA-Z_](a-zA-Z\d)*$

^和$限定在一行中匹配

[a-zA-Z_]表示以下划线或大小写字母开头

(a-zA-Z\d)*表示开头跟着0个或多个字母或数字

eg.C语言单行注释

\/\/.

\/匹配/的字面值

.匹配除了换行外的所有

无需加^匹配开头

1	int a = 0; //这里的注释并非从开头写的也有效

eg.C语言多行注释

1	\/\[\s\S]?\*/\

\/\*\*\/是多行注释的字面形式

[\s\S]*?的作用是匹配任意字符

\s 表示空白字符（包括空格、制表符、换行符等）

\S 则表示非空白字符。

*表示匹配一次或多次

?用来尽可能减少匹配次数，遇到第一个*/停止

eg.三的倍数

1	(0\|(1(010)1))*

Flex代码编写

%{
    #include "stdio.h"
%}

手撸代码

要求

// PL/0 demo

(*
	This is a multi-line
	comment
*)

const limit=10;
var n, f, test, t1, t2;
begin
     n := 0;
     f := 1;
     while n # limit do
     begin
          n := n + 1;
          f := f * n;
     end;
     test := 1+2-3*4/(5-6)+-7;
	t1:=test*2;
	t2:=t1+test;
     call print;	// print all var
end.

预处理

实验要求：

1.合并空白符：把原始程序中相邻的空格、制表符、回车等空白符合并成一个空格，便于后续处理

2.消除注释：消除原始程序中的注释内容；PL/0语法中没有规定注释的格式，参照Pascal语言规定如下两种注释格式：

单行注释：“//”引导内容，与C++语言中单行注释一致。
多行注释：“(*”和“*)”之间内容，具体参见后面示例程序。

删除注释

代码如下

void removeComments(char *source, char *destination) {
    int i, j;
    int len = strlen(source);
    int inSingleLineComment = 0;
    int inMultiLineComment = 0;

    for (i = 0, j = 0; i < len; i++) {
        if (!inSingleLineComment && !inMultiLineComment) {
            if (source[i] == '/' && source[i + 1] == '/') {
                inSingleLineComment = 1;
                i++; // 跳过第二个斜杠
            } else if (source[i] == '(' && source[i + 1] == '*') {
                inMultiLineComment = 1;
                i++; // 跳过第二个星号
            } else {
                destination[j++] = source[i];
            }
        } else if (inSingleLineComment && source[i] == '\n') {
            inSingleLineComment = 0;
        } else if (inMultiLineComment && source[i] == '*' && source[i + 1] == ')') {
            inMultiLineComment = 0;
            i++; // 跳过第二个括号
        }
    }

    destination[j] = '\0'; // 在目标字符串末尾添加结束符
}

合并空白符

代码如下：

void mergeWhitespace(char *source, char *destination) {
    int i, j;
    int len = strlen(source);
    int isWhitespace = 0; // 用于标记前一个字符是否是空白符

    for (i = 0, j = 0; i < len; i++) {
        if (source[i] == ' ' || source[i] == '\t' || source[i] == '\n' || source[i] == '\r') {
            if (!isWhitespace) {
                destination[j++] = ' ';
                isWhitespace = 1;
            }
        } else {
            destination[j++] = source[i];
            isWhitespace = 0;
        }
    }

    destination[j] = '\0'; // 在目标字符串末尾添加结束符
}

词法分析

输入输出

计划采取命令行和直接无参输入文件名两种方式，先写个架子，架子

#include <stdio.h>
#include <string.h>
#define MAXFILENAME 100
#define BUFFERSIZE 1024

int inSingleLineComment = 0;
int inMultiLineComment = 0;

// 消除注释
void removeComments(char *source, char *destination) {
    int i, j;
    int len = strlen(source);

    for (i = 0, j = 0; i < len; i++) {
        if (!inSingleLineComment && !inMultiLineComment) {
            if (source[i] == '/' && source[i + 1] == '/') {
                inSingleLineComment = 1;
                i++; // 跳过第二个斜杠
            } else if (source[i] == '(' && source[i + 1] == '*') {
                inMultiLineComment = 1;
                i++; // 跳过第二个星号
            } else {
                destination[j++] = source[i];
            }
        } else if (inSingleLineComment && source[i] == '\n') {
            inSingleLineComment = 0;
        } else if (inMultiLineComment && source[i] == '*' && source[i + 1] == ')') {
            inMultiLineComment = 0;
            i++; // 跳过第二个括号
        }
    }

    destination[j] = '\0'; // 在目标字符串末尾添加结束符
}

// 合并空白符
void mergeWhitespace(char *source, char *destination) {
    int i, j;
    int len = strlen(source);
    int isWhitespace = 0; // 用于标记前一个字符是否是空白符

    for (i = 0, j = 0; i < len; i++) {
        if (source[i] == ' ' || source[i] == '\t' || source[i] == '\n' || source[i] == '\r') {
            if (!isWhitespace) {
                destination[j++] = ' ';
                isWhitespace = 1;
            }
        } else {
            destination[j++] = source[i];
            isWhitespace = 0;
        }
    }

    destination[j] = '\0'; // 在目标字符串末尾添加结束符
}


int main(int argc, char *argv[]) {
    //得到文件名
    FILE *file;
    FILE *preFile;
    char fileName[MAXFILENAME];
    char preFileName[MAXFILENAME+2];
    if(argc > 1) {
        strcpy(fileName, argv[1]);
    } else {
        printf("请输入要分析的文件名:\n");
        scanf("%99s",fileName);
    }
    //puts(fileName);
    //printf("%s",fileName);

    //打开获取的文件
    file = fopen(fileName,"rw");
    if(file == NULL) {
        printf("无法打开文件 %s\n",fileName);
        return 1;
    }

    //构造输出文件名
    char *lastDot = strrchr(fileName, '.');
    if(lastDot != NULL) {
        // 将最后一个点以及后面的内容替换为".i"
        sprintf(preFileName, "%.*s.i", (int)(lastDot - fileName), fileName);
    } else {
        //没找到直接追加".i"
        sprintf(preFileName, "%s.i", fileName);
    }
    //puts(preFileName);

    //打开输出文件
    preFile = fopen(preFileName,"w");
    //preFile =
    //fopen("11","w");灵异事件，不知道为什么执行过一遍这条语句后，所有文件均能以写入打开了！？
    if (preFile == NULL) {
        printf("无法创建输出文件 %s\n",preFileName);
        fclose(file);
        return 1;
    }
    //printf("成功创建了文件");

    //执行不断从文件中读入数据到内存中处理，预处理的主要步骤
    //主要进行注释删除以及空格合并
    //这里fgets函数是整行读取，与注释格式冲突，出现bug
    /*char inputBuffer[1024],outputBuffer[1024];
    int BUFFER_SIZE = 1024;
    while (fgets(inputBuffer, BUFFER_SIZE, file) != NULL) {
        removeComments(inputBuffer,outputBuffer);
        mergeWhitespace(outputBuffer,outputBuffer);
        fputs(outputBuffer, preFile);
    }*/

    char inputBuffer[BUFFERSIZE],outputBuffer[BUFFERSIZE];
    char ch;
    int index = 0;
    while (ch = fgetc(file)) {
        if (index == BUFFERSIZE) {
            index = 0;
            removeComments(inputBuffer,outputBuffer);
            mergeWhitespace(outputBuffer,outputBuffer);
            fputs(outputBuffer,preFile);
        } else if (ch == EOF){
            index = 0;
            removeComments(inputBuffer,outputBuffer);
            mergeWhitespace(outputBuffer,outputBuffer);
            fputs(outputBuffer,preFile);
            break;
        }else {
            inputBuffer[index++] = ch;
        }
    } 

    //关闭文件流
    fclose(file);
    fclose(preFile);
    //printf("预处理完成,处理结果已写入文件 %s\n",preFileName);

    return 0;
}

数据类型定义

// 定义单词种别的字符串表示数组
const char *categoryStrings[] = {
    "IDENTIFIER",    // 标识符
    "CONSTANT",      // 常数
    "OPERATOR_ADD",  // 加法运算符 '+'
    "OPERATOR_SUB",  // 减法运算符 '-'
    "OPERATOR_MUL",  // 乘法运算符 '*'
    "OPERATOR_DIV",  // 除法运算符 '/'
    "OPERATOR_EQ",   // 等号 '='
    "OPERATOR_NE",   // 不等号 '#'
    "OPERATOR_LT",   // 小于 '<'
    "OPERATOR_GT",   // 大于 '>'
    "OPERATOR_ASSIGN", // 赋值运算符 ':='
    "DELIMITER_LPAREN", // 左括号 '('
    "DELIMITER_RPAREN", // 右括号 ')'
    "DELIMITER_COMMA",  // 逗号 ','
    "DELIMITER_PERIOD", // 句号 '.'
    "DELIMITER_SEMICOLON", // 分号 ';'
    "KEYWORD_BEGIN",   // begin
    "KEYWORD_END",     // end
    "KEYWORD_IF",      // if
    "KEYWORD_THEN",    // then
    "KEYWORD_WHILE",   // while
    "KEYWORD_DO",      // do
    "KEYWORD_CONST",   // const
    "KEYWORD_VAR",     // var
    "KEYWORD_CALL",    // call
    "KEYWORD_PROCEDURE", // procedure
    "KEYWORD_ODD"      // odd
};

// 定义单词种别
typedef enum {
    IDENTIFIER,       // 标识符
    CONSTANT,         // 常数
    OPERATOR_ADD,     // 加法运算符 '+'
    OPERATOR_SUB,     // 减法运算符 '-'
    OPERATOR_MUL,     // 乘法运算符 '*'
    OPERATOR_DIV,     // 除法运算符 '/'
    OPERATOR_EQ,      // 等号 '='
    OPERATOR_NE,      // 不等号 '#'
    OPERATOR_LT,      // 小于 '<'
    OPERATOR_GT,      // 大于 '>'
    OPERATOR_ASSIGN,  // 赋值运算符 ':='
    DELIMITER_LPAREN, // 左括号 '('
    DELIMITER_RPAREN, // 右括号 ')'
    DELIMITER_COMMA,  // 逗号 ','
    DELIMITER_PERIOD, // 句号 '.'
    DELIMITER_SEMICOLON, // 分号 ';'
    KEYWORD_BEGIN,    // begin
    KEYWORD_END,      // end
    KEYWORD_IF,       // if
    KEYWORD_THEN,     // then
    KEYWORD_WHILE,    // while
    KEYWORD_DO,       // do
    KEYWORD_CONST,    // const
    KEYWORD_VAR,      // var
    KEYWORD_CALL,     // call
    KEYWORD_PROCEDURE,// procedure
    KEYWORD_ODD       // odd
} Category;

// 定义单词符号结构体
typedef struct {
    Category category;  // 单词的种别
    int attribute;      // 属性值，可以根据需要选择合适的数据类型
} Token;

// 定义打印单词符号的函数
/*void printToken(Token token) {
    printf("(%d, %d)\n", token.category, token.attribute);
}*/
// 定义打印单词符号的函数
void printToken(Token token) {
    printf("(%s, %d)\n", categoryStrings[token.category], token.attribute);
}

1.保留字提示改成关键字

2.最后为什么输出ERRO

3.以命令行参数的形式传入参数

4.各种输出显示仍然有点问题