Python实现SLR(1)语法分析器
实验课前一天晚上肝了个SLR语法分析器,当时还发朋友圈语法分析器和我晚上总得走一个,从第二天状态来看,应该是我们俩一起走了(笑
- 编写的时间比较仓促,所以代码有些地方实现不是很好,存在一些问题,以后有时间的话再来修补一下,比如在对两个项目规范族进行比较时效率比较低,first集和follow集中对连续多个非终结符推到ε的情况下可能会有bug,但在我的文法定义中特意绕开了ε,勉强能用。
- 为了方便代码阅读,加入了许多注释后的打印语句,将这些打印语句取消注释后运行,能够告诉你当前这段代码所做的事。
- 在编写过程中,尽量对模块进行封装,把逻辑和文法定义分开,做到文法可以方便地修改,但时间过于仓促可能还是有点小问题。当文法修改时,修改
getCol函数(该函数将终结符和非终结符映射到action和goto表中相应的列),initProduction函数(该函数定义了文法产生式(拓广文法),在本文中有28个产生式),source(输入单词序列),varset(非终结符集合),terminalset(终结符集合)
SLR(1)分析流程
- 输入文法
- 求first集
- 求follow集
- 构造LR(0)项目集DFA
- 构造Action和Goto
- 按照Action和Goto进行分析
1.主要数据结构定义和基础函数:
基础函数
-
isVariable函数判断是不是非终结符
-
isTerminal函数判断是不是终结
-
transf(production_set, var)函数 production_set为一个LR(0)项目,尝试通过var(终结符或非终结符)进行转移
-
isSameStatus(status1, status2)函数:判断status1和status2是不是两个相同的LR(0)项目
-
isInPointset(production_set, pointset):#用来检验production_set是不是已经存在的point ,如果存在就把point返回(生成DFA时用到)
数据结构
- 产生式采用类来存储,left和right分别为list,number‘为产生式编号
- GraphPoint存储DFA转移,transfer为有向边集合,集合中的一个元素由var(终结符或非终结符),和另一个GraphPoint组成
class Production:
def __init__(self, left, right, number):
self.left = left
self.right = right
self.number = number
class GraphPoint:
def __init__(self, begin_production, id):
self.status = begin_production
self.transfer = []
self.id = id
def add_transfer(self, var, graphPoint):
self.transfer.append([var, graphPoint])
2.文法定义
1.分析目标代码:int lexicalanalysis(){ float a; int b; a=1.1; b=2; while(b<100){ b=b+1; a=a+3;}; if(a>5) {b=b-1;} else {b=b+1;}}
2.语法分析器输入为目标代码的词法分析器输出的单词序列
source = [[5, "int", " 关键字"], [1, "lexicalanalysis", " 标识符"], [13, "(", " 左括号"], [14, ")", " 右括号"], [20, "{", " 左大括号"],
[4, "float", " 关键字"], [1, "a", " 标识符"], [15, ";", " 分号"], [5, "int", " 关键字"], [1, "b", " 标识符"],
[15, ";", " 分号"], [1, "a", " 标识符"], [12, "=", " 赋值号"], [3, "1.1", " 浮点数"], [15, ";", " 分号"], [1, "b", " 标识符"],
[12, "=", " 赋值号"], [2, "2", " 整数"], [15, ";", " 分号"], [8, "while", " 关键字"], [13, "(", " 左括号"],
[1, "b", " 标识符"], [17, "<", " 小于号"], [2, "100", " 整数"], [14, ")", " 右括号"], [20, "{", " 左大括号"],
[1, "b", " 标识符"], [12, "=", " 赋值号"], [1, "b", " 标识符"], [9, "+", " 加 号"], [2, "1", " 整数"], [15, ";", " 分号"],
[1, "a", " 标识符"], [12, "=", " 赋值号"], [1, "a", " 标识符"], [9, "+", " 加号"], [2, "3", " 整数"], [15, ";", " 分号"],
[21, "}", " 右大括号"], [15, ";", " 分号"], [6, "if", " 关键字"], [13, "(", " 左括号"], [1, "a", " 标识符"],
[16, ">", " 大于号"], [2, "5", " 整数"], [14, ")", " 右括号"], [20, "{", " 左大括号"], [1, "b", " 标识符"],
[12, "=", " 赋值号"], [1, "b", " 标识符"], [10, "-", " 减号"], [2, "1", " 整数"], [15, ";", " 分号"], [21, "}", " 右大括号"],
[7, "else", " 关键字"], [20, "{", " 左大括号"], [1, "b", " 标识符"], [12, "=", " 赋值号"], [1, "b", " 标识符"],
[9, "+", " 加号"], [2, "1", " 整数"], [15, ";", " 分号"], [21, "}", " 右大括号"], [21, "}", " 右大括号"]]
3.文法定义:拓广文法共有28个产生式,0号产生式为保证分析器只有一个接受状态,而拓广的产生式。
def initProduction():
production_list = []
production = Production(["A1"], ["A"], 0)
production_list.append(production)
production = Production(["A"], ["E", "I", "(", ")", "{", "D", "}"], 1)
production_list.append(production)
production = Production(["E"], ["int"], 2)
production_list.append(production)
production = Production(["E"], ["float"], 3)
production_list.append(production)
production = Production(["D"], ["D", ";", "B"], 4)
production_list.append(production)
production = Production(["B"], ["F"], 5)
production_list.append(production)
production = Production(["B"], ["G"], 6)
production_list.append(production)
production = Production(["B"], ["M"], 7)
production_list.append(production)
production = Production(["F"], ["E", "I"], 8)
production_list.append(production)
production = Production(["G"], ["I", "=", "P"], 9)
production_list.append(production)
production = Production(["P"], ["K"], 10)
production_list.append(production)
production = Production(["P"], ["K", "+", "P"], 11)
production_list.append(production)
production = Production(["P"], ["K", "-", "P"], 12)
production_list.append(production)
production = Production(["I"], ["id"], 13)
production_list.append(production)
production = Production(["K"], ["I"], 14)
production_list.append(production)
production = Production(["K"], ["number"], 15)
production_list.append(production)
production = Production(["K"], ["floating"], 16)
production_list.append(production)
production = Production(["M"], ["while", "(", "T", ")", "{", "D", ";", "}"], 18)
production_list.append(production)
production = Production(["N"], ["if", "(", "T", ")", "{", "D",";", "}", "else", "{", "D", ";","}"], 19)
production_list.append(production)
production = Production(["T"], ["K", "L", "K"], 20)
production_list.append(production)
production = Production(["L"], [">"], 21)
production_list.append(production)
production = Production(["L"], ["<"], 22)
production_list.append(production)
production = Production(["L"], [">="], 23)
production_list.append(production)
production = Production(["L"], ["<="], 24)
production_list.append(production)
production = Production(["L"], ["=="], 25)
production_list.append(production)
production = Production(["D"], ["B"], 26)
production_list.append(production)
production = Production(["B"], ["N"], 27)
production_list.append(production)
return production_list
3.求First集
根据此算法即可求解first集,第8,9步可以采用递归的方式进行求解。
def getFirst(production_list, varset, terminalset):
first_dic = {}
# 用来标记first集是否计算完毕,防止重复计算浪费时间
done = {}
for var in varset:
first_dic[var] = set()
done[var] = 0
# 所有终结符的first集是他自身
for var in terminalset:
first_dic[var] = {var}
done[var] = 1
# print("初始化后的done",done)
# print("初始化的first_dic",first_dic)
for var in varset:
if done[var] == 0:
# print("计算",var)
getFirstForVar(var, first_dic, varset, terminalset, done)
# print("计算完毕",var)
# print("此时的done", done)
# print("此时的first_dic", first_dic)
else:
pass
return first_dic
def getFirstForVar(var, first_dic, varset, terminalset, done):
# 已经推导过直接结束
if done[var] == 1:
# print("我已经推导过了吼")
return
# 对非终结符求first集合,先看右边第一个元素为终结符
for production in production_list:
if var in production.left:
if isTerminal(production.right[0], terminalset):
first_dic[var].add(production.right[0])
# 用null表示空字符
if production.right[0] == "null":
# print("出现右侧为空")
first_dic[var].add("null")
# 右边第一个元素为非终结符
for production in production_list:
if var in production.left:
if isVariable(production.right[0], varset):
if var == production.right[0]:
continue
if done[production.right[0]] == 0:
getFirstForVar(production.right[0], first_dic, varset, terminalset, done)
if "null" in first_dic[production.right[0]]:
first_dic[production.right[0]].remove("null")
first_dic[var] = first_dic[var] | first_dic[production.right[0]]
# print("将 ",production.right[0],"的集合 ",first_dic[production.right[0]],"并入",var,"的集合中",first_dic[var],"中","得到",)
if isVariable(production.right[0], varset) and len(production.right) > 1:
index = 1
count = 1
while isVariable(production.right[index], varset):
index = index + 1
count = count + 1
if index >= len(production.right):
break
i = 0
while i < count:
getFirstForVar(production.right[i], first_dic, varset, terminalset, done)
if "null" in first_dic[production.right[i]]:
getFirstForVar(production.right[i + 1], first_dic, varset, terminalset, done)
first_dic[var] = first_dic[var] | first_dic[production.right[i + 1]]
else:
break
i = i + 1
# 完成后置为1
done[var] = 1
4.求解follow集
通过使用非终结符的follow集,提高识别能力,是SLR(1)分析的核心思想。
只有当项目集包含 A→α· ,则action[i,x]= rj,x属于FOLLOW(A),j为产生式 A→α的编号,通过这种方式可以解决一部分的移进和归约冲突。
ps:代码中有坑,如果文法中出现了刁钻ε,比如几个非终结符连续推为空,会产生bug,时间原因以及我的文法定义中并不存在ε就没有解决这个问题。
def getFollow(varset, terminalset, first_dic, production_list):
follow_dic = {}
done = {}
for var in varset:
follow_dic[var] = set()
done[var] = 0
follow_dic["A1"].add("#")
# for var in terminalset:
# follow_dic[var]=set()
# done[var] = 0
for var in follow_dic:
getFollowForVar(var, varset, terminalset, first_dic, production_list, follow_dic, done)
return follow_dic
def getFollowForVar(var, varset, terminalset, first_dic, production_list, follow_dic, done):
if done[var] == 1:
return
for production in production_list:
if var in production.right:
##index这里在某些极端情况下有bug,比如多次出现var,index只会返回最左侧的
if production.right.index(var) != len(production.right) - 1:
follow_dic[var] = first_dic[production.right[production.right.index(var) + 1]] | follow_dic[var]
# 没有考虑右边有非终结符但是为null的情况
if production.right[len(production.right) - 1] == var:
if var != production.left[0]:
# print(var, "吸纳", production.left[0])
getFollowForVar(production.left[0], varset, terminalset, first_dic, production_list, follow_dic,
done)
follow_dic[var] = follow_dic[var] | follow_dic[production.left[0]]
done[var] = 1
5.构建LR(0)项目集DFA
1.首先先定义一个CLOSURE函数,它将会对集合中的产生式状态进行不断地扩充,并最终形成一个项目集闭包
def CLOSURE(varset, terminalset, production_set, production_list):
算法:
2.构建DFA。函数定义
def generatingGraph(begin_production_set, varset, terminalset, production_list):
我们首先使用0号产生式来形成初始LR(0)项目集,产生初始节点(即开头数据结构中的类),并将它放到一个集合中,每次从集合中取一个节点,来用 每一个var属于(V | T)尝试进行转移,转移成功后将这条有向边存入该节点地transfer中,每次转移后的项目集判断是否是新项目集,如果是新项目集,则将新项目集放入集合中,当集合为空算法停止。
# 生成状态转移图
def generatingGraph(begin_production_set, varset, terminalset, production_list):
global id
CLOSURE(varset, terminalset, begin_production_set, production_list)
beginPoint = GraphPoint(begin_production_set, id)
id = id + 1
# print("从这个状态开始!")
# print(beginPoint.id)
# for onepro in beginPoint.status:
# print(onepro.number, " ", onepro.left, "->", onepro.right, " ")
pointset = [beginPoint]
set = varset | terminalset
stack = [beginPoint]
while len(stack) != 0:
currentPoint = stack.pop()
######
# print("该点被弹出,进行转移!")
# print(currentPoint.id)
# for onepro in currentPoint.status:
# print(onepro.number, " ", onepro.left, "->", onepro.right, " ")
#####
for var in set:
# print("尝试用",var,"进行转移")
result = transf(currentPoint.status, var)
if len(result) == 0:
# print(var,"转移失败!")
continue
else:
# print(var,"可转移!")
# print("将使用result进行转移!")
# for onepro in result:
# print(onepro.number, " ", onepro.left, "->", onepro.right, " ")
# 求出转移后的闭包
CLOSURE(varset, terminalset, result, production_list)
nextpoint = isInPointset(result, pointset)
if nextpoint is None:
# print(var,"转移为新状态:")
# 新节点压入寻找栈和点集合中,旧节点不能压入
nextpoint = GraphPoint(result, id)
id = id + 1
pointset.append(nextpoint)
stack.append(nextpoint)
# print(nextpoint.id)
# for onepro in nextpoint.status:
# print(onepro.number, " ", onepro.left, "->", onepro.right, " ")
currentPoint.add_transfer(var, nextpoint)
# print("生成一个新状态")
# for onepro in result:
# print(onepro.number," ",onepro.left,"->",onepro.right," ")
return pointset
# 形成闭包
def CLOSURE(varset, terminalset, production_set=[], production_list=[]):
sizebefore = len(production_list)
sizeafter = -1
# 用来测试是不是已经形成闭包,避免进入死循环
flag = 0
for production_in_set in production_set:
if production_in_set.right.index(".") != len(production_in_set.right) - 1:
if isVariable(production_in_set.right[production_in_set.right.index(".") + 1], varset):
flag = 1
if flag == 0:
return
while sizeafter != sizebefore:
for production_in_set in production_set:
# 点在最右侧就不可能转移
if (production_in_set.right.index(".") == len(production_in_set.right) - 1):
continue
i = production_in_set.right.index(".") + 1;
# print(i," length",len(production_in_set.right))
if isTerminal(production_in_set.right[i], terminalset):
continue;
templist = []
for x in production_list:
# print(i,len(production_in_set.right))
if x.left[0] == production_in_set.right[i]:
y = copy.deepcopy(x)
y.right.insert(0, ".")
flag = 0
for one in production_set:
rightflag = 0;
if len(one.right) != len(y.right):
rightflag = 1
else:
for j in range(0, len(y.right)):
if one.right[j] != y.right[j]:
rightflag = 1
if one.left[0] == y.left[0] and rightflag == 0:
flag = 1
if flag == 0:
templist.append(y)
sizebefore = len(production_set)
production_set.extend(templist)
sizeafter = len(production_set)
6.构造Action和Goto表
算法:
算法中的(1)(2)思想类似于计算机网络中的洪泛控制,将初始节点放入一个集合中,从集合中取一个节点,从一个节点走出它的所有有向边,并将这个节点标记为已经走过,将到达所有的之前没有走过的节点放入集合中,如此以往,直到集合为空。代码中的一些打印出错的语句为检测是否存在冲突的语句,由于编写时间限制原因,大多数的冲突可被测出,但少部分冲突仍然不可见(天坑)。
算法(3)(4)通过遍历项目集中的产生式状态即可判断。
#Cell为Action中的一个元素,do表示动作,which表示数字,如转移的状态或采用归约的产生式序号,done为是否已经走过,类似于洪泛控制的作用
class Cell:
def __init__(self):
self.do = -1
self.which = -1
self.done = 0
def initActionAndGoto(pointset, varset, terminalset, begin, follow_dic):
Action = [[Cell() for i in range(len(terminalset))] for j in range(len(pointset))]
Goto = [[-1 for i in range(len(varset))] for j in range(len(pointset))]
for point in pointset:
# 转移状态
for tran in point.transfer:
if isVariable(tran[0], varset):
if Goto[point.id][getCol(tran[0])] != -1:
print("出错404")
Goto[point.id][getCol(tran[0])] = tran[1].id
else:
if Action[point.id][getCol(tran[0])].done == 1:
print("出错403")
Action[point.id][getCol(tran[0])].done = 1
Action[point.id][getCol(tran[0])].do = "S"
Action[point.id][getCol(tran[0])].which = tran[1].id
for production in point.status:
if production.right.index(".") == len(production.right) - 1 and production.left[0] == begin:
if Action[point.id][getCol("#")].done == 1:
print("出错415")
Action[point.id][getCol("#")].do = "acc"
Action[point.id][getCol("#")].done = 1
if production.right.index(".") == len(production.right) - 1 and production.left[0] != begin:
# 在follow集中才可归约
for terminal in terminalset:
if terminal in follow_dic[production.left[0]]:
# 冲突检测
if Action[point.id][getCol(terminal)].done == 1:
for xx in point.status:
print(xx.number, " ", xx.left, "->", xx.right)
print("Action表", point.id, "行", getCol(terminal), "列冲突")
print("原本", Action[point.id][getCol(terminal)].do, Action[point.id][getCol(terminal)].which)
print("现在", "R", production.number)
print("出错416")
Action[point.id][getCol(terminal)].do = "R"
Action[point.id][getCol(terminal)].done = 1
# 采用该产生式归约
Action[point.id][getCol(terminal)].which = production.number
return Action, Goto
7.根据Action和Goto进行语法分析
算法思想:
开始时句型前缀栈和状态站分别压入#和0状态。
循环:
如果表中为si,则将缓冲区第一个元素压入句型前缀栈,并将 i(状态)压入状态栈
如果表中为ri , 则采用第i个表达式进行归约,弹出的元素个数为i个表达式的右侧的元素个数,之后根据栈顶状态和归约得到的非终结符查看GOTO表,查找当前状态,并将当前状态和规约得到的非终结符分别入栈。
如果表中为error!,恭喜出错,去找bug吧(也有可能是你的输入不符合当前文法,文法冲突也会导致这种情况)。
如果表中为acc,恭喜成功。
# SLR分析开始
def SLR(Action, Goto, source, production_list):
source.append([0, "#", "结束符"])
statusstack = [0]
sentence_stack = ["#"]
print(source)
while 1:
print("*****************************************")
print("缓冲区剩余元素", source)
terminal = source.pop(0)
print("状态栈", statusstack)
print("句型栈", sentence_stack)
# 移进
if Action[statusstack[len(statusstack) - 1]][terminal[0]].do == "S":
print("动作: 移入操作,从缓冲区中读取",terminal[1],"元素进行移入,并根据Action压入",Action[statusstack[len(statusstack) - 1]][terminal[0]].which,"状态")
statusstack.append(Action[statusstack[len(statusstack) - 1]][terminal[0]].which)
sentence_stack.append(terminal[1])
elif Action[statusstack[len(statusstack) - 1]][terminal[0]].do == "R":
# 归约
# 记录归约产生式
r_production = 0
for production in production_list:
if production.number == Action[statusstack[len(statusstack) - 1]][terminal[0]].which:
r_production = production
for i in range(len(r_production.right)):
statusstack.pop()
sentence_stack.pop()
statusstack.append(Goto[statusstack[len(statusstack) - 1]][getCol(r_production.left[0])])
print("动作: 归约操作,根据Action表利用第",r_production.number,"个产生式归约")
sentence_stack.append(r_production.left[0])
source.insert(0, terminal)
elif Action[statusstack[len(statusstack) - 1]][terminal[0]].do == "acc":
print("!!!!!!!!!!语义分析完成!!!!!!!!!!!!!!")
break;
else:
print("error 462!");
8.运行与测试
source = [[5, "int", " 关键字"], [1, "lexicalanalysis", " 标识符"], [13, "(", " 左括号"], [14, ")", " 右括号"], [20, "{", " 左大括号"],
[4, "float", " 关键字"], [1, "a", " 标识符"], [15, ";", " 分号"], [5, "int", " 关键字"], [1, "b", " 标识符"],
[15, ";", " 分号"], [1, "a", " 标识符"], [12, "=", " 赋值号"], [3, "1.1", " 浮点数"], [15, ";", " 分号"], [1, "b", " 标识符"],
[12, "=", " 赋值号"], [2, "2", " 整数"], [15, ";", " 分号"], [8, "while", " 关键字"], [13, "(", " 左括号"],
[1, "b", " 标识符"], [17, "<", " 小于号"], [2, "100", " 整数"], [14, ")", " 右括号"], [20, "{", " 左大括号"],
[1, "b", " 标识符"], [12, "=", " 赋值号"], [1, "b", " 标识符"], [9, "+", " 加 号"], [2, "1", " 整数"], [15, ";", " 分号"],
[1, "a", " 标识符"], [12, "=", " 赋值号"], [1, "a", " 标识符"], [9, "+", " 加号"], [2, "3", " 整数"], [15, ";", " 分号"],
[21, "}", " 右大括号"], [15, ";", " 分号"], [6, "if", " 关键字"], [13, "(", " 左括号"], [1, "a", " 标识符"],
[16, ">", " 大于号"], [2, "5", " 整数"], [14, ")", " 右括号"], [20, "{", " 左大括号"], [1, "b", " 标识符"],
[12, "=", " 赋值号"], [1, "b", " 标识符"], [10, "-", " 减号"], [2, "1", " 整数"], [15, ";", " 分号"], [21, "}", " 右大括号"],
[7, "else", " 关键字"], [20, "{", " 左大括号"], [1, "b", " 标识符"], [12, "=", " 赋值号"], [1, "b", " 标识符"],
[9, "+", " 加号"], [2, "1", " 整数"], [15, ";", " 分号"], [21, "}", " 右大括号"], [21, "}", " 右大括号"]]
id = 0
varset = {"A1", "A", "E", "I", "D", "F", "G", "M", "P", "K", "T", "L", "B","N"}
terminalset = {"(", ")", "{", "}", ";", "int", "float", "number", "floating", "while", "if", "else", ">", "<", ">=",
"<=", "==", "=", "#", "+", "-", "id"}
production_list = initProduction()
first_dic = getFirst(production_list, varset, terminalset)
# for x in first_dic:
# print(x," : ",first_dic[x])
follow_dic = getFollow(varset, terminalset, first_dic, production_list)
# print("follow:")
# for x in follow_dic:
# print(x, ":", follow_dic[x])
production = Production(["A1"], [".", "A"], 0)
production_set = [production]
# for x in production_set:
# print(x.number," ",x.left,"->",x.right," ")
pointset = generatingGraph(production_set, varset, terminalset, production_list)
begin = "A1"
Action, Goto = initActionAndGoto(pointset, varset, terminalset, begin, follow_dic)
print("**********************GOTO***********************************")
for i in range(len(Goto)):
print(i, end=" ")
for j in range(len(Goto[i])):
print("%3d" % Goto[i][j], end=" ")
print("")
print("**********************Action***********************************")
for i in range(len(Action)):
print("%2d" % i, end=": ")
for j in range(len(Action[i])):
if (Action[i][j].done == 0):
print("error!", end=" ")
else:
print("%3s" % Action[i][j].do, "%2d" % Action[i][j].which, end=" ")
print("")
SLR(Action, Goto, source, production_list)
结果:
GOTO表(局部):(60*14)
Action表(局部):60*22
规约过程(局部):共142次
开始:
结束:
完整代码编译原理结课会放在博客里
2020.11.12