PHP的编译与执行笔记 - PHP的编译

PHP编译流程

词法,语法解析

example
$a=3+4-6先经过词法分析器分割成$a,=,3,+,4,-,6,将这些token转给语法分析器,生成语法分析树
语法分析树
re2c词法扫描器
yacc语法分析器
定义词法解析规则:

1
2
3
4
5
6
7
8
/*!re2c
LABEL [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
LNUM [0-9]+

// rules: "$" followed by LABEL yields T_VAR, a run of digits yields T_NUM
"$"{LABEL} {return T_VAR;}
{LNUM} {return T_NUM;}
*/

定义语法解析规则:

1
2
3
4
5
6
7
8
// token definitions
%token T_VAR
%token T_NUM

// grammar rule: matches `T_VAR = T_NUM + T_NUM`, adds the two numbers and prints the sum
statement:
T_VAR '=' T_NUM '+' T_NUM {ret = str2int($3) + str2int($5);printf("%d",ret);}
;

配置递归规则:

1
2
3
4
5
6
7
8
// grammar rules: expr refers to itself, so an expression of any length can be matched
statement:
T_VAR '=' expr {}
;
expr:
T_NUM {...}
|expr '?' T_NUM {}
;

抽象语法树(ast)

普通节点

非叶子节点,通常拿来做根节点,结构为zend_ast

1
2
3
4
5
6
7
typedef struct _zend_ast zend_ast;
/* Ordinary (non-leaf) AST node. */
struct _zend_ast {
zend_ast_kind kind; /* Type of the node (ZEND_AST_* enum constant) */
zend_ast_attr attr; /* Additional attribute, use depending on node type */
uint32_t lineno; /* Line number */
zend_ast *child[1]; /* Array of children (using struct hack); the actual child count depends on kind, it is not fixed at 1 */
};

注意这里的child[1]并不意味着只有一个子节点,子节点数是根据kind类型限定的,利用struct hack作为可变长数组

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * Kinds of fixed-arity AST nodes (excerpt; `...` marks elided entries).
 * The first kind of each group is (child count << ZEND_AST_NUM_CHILDREN_SHIFT)
 * and the following kinds in the group continue counting from that value.
 */
enum _zend_ast_kind {
...

/* 0 child nodes */
ZEND_AST_MAGIC_CONST = 0 << ZEND_AST_NUM_CHILDREN_SHIFT,
ZEND_AST_TYPE,

/* 1 child node */
ZEND_AST_VAR = 1 << ZEND_AST_NUM_CHILDREN_SHIFT,
ZEND_AST_CONST,
...

/* 2 child nodes */
ZEND_AST_DIM = 2 << ZEND_AST_NUM_CHILDREN_SHIFT,
ZEND_AST_PROP,
...

/* 3 child nodes */
ZEND_AST_METHOD_CALL = 3 << ZEND_AST_NUM_CHILDREN_SHIFT,
ZEND_AST_STATIC_CALL,
ZEND_AST_CONDITIONAL,

ZEND_AST_TRY,
ZEND_AST_CATCH,
ZEND_AST_PARAM,
ZEND_AST_PROP_ELEM,

/* 4 child nodes */
ZEND_AST_FOR = 4 << ZEND_AST_NUM_CHILDREN_SHIFT,
ZEND_AST_FOREACH,
};

可以看到不同类型的子节点数是固定的,最多4个子节点

list节点

由多个具有相同类型的节点组成
比如use aa,bb,cc,导入多个命名空间
与zend_ast相比多了一个记录子节点数的成员children

1
2
3
4
5
6
7
/* List node: same leading fields as zend_ast, plus an explicit child count. */
typedef struct _zend_ast_list {
zend_ast_kind kind;
zend_ast_attr attr;
uint32_t lineno;
uint32_t children; /* number of entries in child[] */
zend_ast *child[1];
} zend_ast_list;

其对应的类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/* Kinds used by list nodes (excerpt; `...` marks elided entries). */
enum _zend_ast_kind {
...

/* list nodes: numbering starts at 1 << ZEND_AST_IS_LIST_SHIFT */
ZEND_AST_ARG_LIST = 1 << ZEND_AST_IS_LIST_SHIFT,
ZEND_AST_LIST,
ZEND_AST_ARRAY,
ZEND_AST_ENCAPS_LIST,
ZEND_AST_EXPR_LIST,
ZEND_AST_STMT_LIST,
ZEND_AST_IF,
ZEND_AST_SWITCH_LIST,
ZEND_AST_CATCH_LIST,
ZEND_AST_PARAM_LIST,
ZEND_AST_CLOSURE_USES,
ZEND_AST_PROP_DECL,
ZEND_AST_CONST_DECL,
ZEND_AST_CLASS_CONST_DECL,
ZEND_AST_NAME_LIST,
ZEND_AST_TRAIT_ADAPTATIONS,
ZEND_AST_USE,

...
};

其中有个比较特殊的类型
ZEND_AST_STMT_LIST
不代表任何语法,用来组织各个节点,相当于一个节点数组
CG(ast)的根节点就是这个类型

1
2
3
4
$a = 123;
$b = "hi~";

echo $a,$b;

数据节点

结构为zend_ast_zval,没有子节点,但多了一个zval成员

1
2
3
4
5
/* Kind used by data (zval) nodes (excerpt; `...` marks elided entries). */
enum _zend_ast_kind {
/* special nodes */
ZEND_AST_ZVAL = 1 << ZEND_AST_SPECIAL_SHIFT,
...
};
1
2
3
4
5
/* Data node: has no child array; carries a zval instead. */
typedef struct _zend_ast_zval {
zend_ast_kind kind;
zend_ast_attr attr;
zval val; /* the value held by this node */
} zend_ast_zval;

PHP词法,语法分析器的实现

词法规则文件定义在Zend/zend_language_scanner.l
语法规则文件定义在Zend/zend_language_parser.y
这两规则文件需要通过执行re2c,yacc生成对应的c语言代码
编译开始的入口函数为compile_file()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Entry point of compilation (excerpt; `...` marks elided code).
 * Saves the lexer state, opens the script for scanning, creates the AST
 * arena and runs zendparse(); afterwards the AST and its arena are
 * destroyed and the lexer state is restored. Returns op_array, which
 * starts out as NULL.
 */
ZEND_API zend_op_array *compile_file(zend_file_handle *file_handle, int type)
{
zend_lex_state original_lex_state;
zend_op_array *op_array = NULL;
zend_save_lexical_state(&original_lex_state);
// open the PHP script file
if (open_file_for_scanning(file_handle)==FAILURE) {
...
} else {
...
CG(ast) = NULL;
CG(ast_arena) = zend_arena_create(1024 * 32);
if (!zendparse()) {
...
}

zend_ast_destroy(CG(ast));
zend_arena_destroy(CG(ast_arena));
...
}

zend_restore_lexical_state(&original_lex_state);
return op_array;
}
  1. 打开PHP脚本文件,调用zendparse()完成语法分析
  2. zendparse()中将不停调用zendlex()切割token,匹配语法,生成ast
  3. zend_ast_create()就是生成语法树结构的操作

语义值(token值)

比如$a=123; a和123就是语义值,通过zval存储
zval在zendlex()中分配,然后将其地址做为参数传给lex_scan()
进行token扫描,当匹配到某个token时,将其保存到该地址

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#define yylex zendlex
// from zend_compile.c
/*
 * Lexer entry point used by the parser (excerpt; `...` marks elided code).
 * Scans one token with lex_scan() and, if the scan produced a value,
 * stores it in elem->ast as a zval AST node. Returns the token id, or
 * T_ERROR when an exception is pending.
 */
int zendlex(zend_parser_stack_elem *elem) /* {{{ */
{
zval zv;
int retval;
...
again:
ZVAL_UNDEF(&zv);
// run the lexical scan, passing in the address of the zval
retval = lex_scan(&zv);
if (EG(exception)) {
// an exception is pending; the notes treat this as a syntax error
return T_ERROR;
}
...
if (Z_TYPE(zv) != IS_UNDEF) {
// a zval was produced while scanning the token: wrap it in a zend_ast_zval node
elem->ast = zend_ast_create_zval(&zv);
}

return retval;
}

跟进lex_scan()
PHP中解析变量的规则$var_name其词法规则为:

1
2
3
4
5
6
7
8
9
10
11
// from zend_language_scanner.l
/* Scanner rule for `$name` variables (excerpt; `...` marks elided code). */
int lex_scan(zval * zendlval)
{
...
<ST_IN_SCRIPTING,ST_DOUBLE_QUOTES,ST_HEREDOC,ST_BACKQUOTE,ST_VAR_OFFSET>"$"{LABEL} {
// store the matched token value in the zval
// the leading '$' is not kept, hence yytext+1 and yyleng-1
zend_copy_value(zendlval, (yytext+1), (yyleng-1));
RETURN_TOKEN(T_VARIABLE);
}
}

语义值类型

语法分析器传给词法分析器一个zend_parser_stack_elem地址
词法分析器将解析出来的token值保存到这个地址中

1
2
3
4
5
/*
 * Semantic value slot: the parser passes the address of one of these to
 * the lexer, which stores the parsed token value in it.
 */
typedef union _zend_parser_stack_elem {
zend_ast *ast;
zend_string *str;
zend_ulong num;
} zend_parser_stack_elem;

在语法解析规则中,可以在%token、%type之后通过<ast/str/num>指定其语义值使用哪种类型

1
2
3
4
5
6
7
8
%token <ast> T_LNUMBER   "integer number (T_LNUMBER)"
%token <ast> T_DNUMBER "floating-point number (T_DNUMBER)"
%token <ast> T_STRING "identifier (T_STRING)"
%token <ast> T_VARIABLE "variable (T_VARIABLE)"

%type <ast> top_statement namespace_name name statement function_declaration_statement
%type <ast> class_declaration_statement trait_declaration_statement
%type <ast> interface_declaration_statement interface_extends_list

抽象语法树编译

编译的过程主要在完成zendparse()处理之后

1
2
3
4
5
6
7
8
9
10
11
12
/* Excerpt from compile_file(): what happens once zendparse() has succeeded. */
if (!zendparse()) {
...
zend_op_array *original_active_op_array = CG(active_op_array);
// allocate memory for the zend_op_array
op_array = emalloc(sizeof(zend_op_array));
// initialise the zend_op_array
init_op_array(op_array, ZEND_USER_FUNCTION, INITIAL_OP_ARRAY_SIZE);
CG(active_op_array) = op_array;
...
zend_compile_top_stmt(CG(ast));
...
}

zend_op_array初始化成功后从CG(ast)开始编译

1
2
//将抽象语法树编译为opline命令
zend_compile_top_stmt(CG(ast));

跟进

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 * Compiles a top-level statement (excerpt; `...` marks elided code).
 * A ZEND_AST_STMT_LIST is walked child by child; any other node is
 * passed to zend_compile_stmt(). A NULL ast is ignored.
 */
void zend_compile_top_stmt(zend_ast *ast) /* {{{ */
{
if (!ast) {
return;
}

if (ast->kind == ZEND_AST_STMT_LIST) {
// view the zend_ast as a zend_ast_list
zend_ast_list *list = zend_ast_get_list(ast);
uint32_t i;
for (i = 0; i < list->children; ++i) {
// compile each child recursively
zend_compile_top_stmt(list->child[i]);
}
return;
}
// anything that is not a ZEND_AST_STMT_LIST
zend_compile_stmt(ast);
...
}

赋值语句的编译

根据zend_ast_kind的定义可以发现,ZEND_AST_ASSIGN有两个子节点
第一个用于保存变量名称
第二个用于保存变量值表达式

跟进zend_compile_stmt(ast)找到ZEND_AST_ASSIGN

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Compiles a single statement (excerpt; `...` marks elided code).
 * Only the default branch is shown: the node is compiled as an
 * expression and its result is then passed to zend_do_free().
 */
void zend_compile_stmt(zend_ast *ast) /* {{{ */
{
...
switch (ast->kind) {
...
default:
{
znode result;
zend_compile_expr(&result, ast);
zend_do_free(&result);
}
}
...
}

继续跟进zend_compile_expr(&result, ast);

1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Compiles an expression node into result (excerpt; `...` marks elided
 * code). Records the node's line number in CG(zend_lineno), then
 * dispatches on ast->kind; only the ZEND_AST_ASSIGN case is shown.
 */
void zend_compile_expr(znode *result, zend_ast *ast) /* {{{ */
{
/* CG(zend_lineno) = ast->lineno; */
CG(zend_lineno) = zend_ast_get_lineno(ast);

switch (ast->kind) {
...
case ZEND_AST_ASSIGN:
zend_compile_assign(result, ast);
return;
...
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Compiles an assignment node (excerpt; `...` marks elided code).
 * child[0] is the assignment target and child[1] the value expression;
 * for the two target kinds shown, a ZEND_ASSIGN op is emitted.
 */
void zend_compile_assign(znode *result, zend_ast *ast) /* {{{ */
{
zend_ast *var_ast = ast->child[0];// variable name
zend_ast *expr_ast = ast->child[1];// value expression

znode var_node, expr_node;
zend_op *opline;
uint32_t offset;
...
switch (var_ast->kind) {
case ZEND_AST_VAR:
case ZEND_AST_STATIC_PROP:
offset = zend_delayed_compile_begin();
// build the znode for the variable name; it is only needed temporarily here, so it lives on the stack
zend_delayed_compile_var(&var_node, var_ast, BP_VAR_W);
// compile the expression recursively; per the notes this ends at a zval node
zend_compile_expr(&expr_node, expr_ast);
zend_delayed_compile_end(offset);
// emit one op
zend_emit_op(result, ZEND_ASSIGN, &var_node, &expr_node);
return;
...
}
}

根据示例

1
2
$a = 123;
$b = "hi~";

可以得到以下图

通过zend_try_compile_cv()生成znode

1
2
3
4
5
6
7
8
9
/*
 * Builds the znode for a variable (excerpt; `...` marks elided code, so
 * the braces shown here do not balance). When the name child is a zval
 * node, the result is marked IS_CV and its slot comes from lookup_cv().
 */
static int zend_try_compile_cv(znode *result, zend_ast *ast){
zend_ast *name_ast = ast->child[0];
if (name_ast->kind == ZEND_AST_ZVAL) {
zend_string *name = zval_get_string(zend_ast_get_zval(name_ast));
...
result->op_type = IS_CV;
result->u.op.var = lookup_cv(CG(active_op_array), name);
...
}

lookup_cv()就是生成操作数的过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Returns the operand value for the compiled variable `name` in op_array,
 * registering the variable first if it is not already in op_array->vars.
 * The returned value is ZEND_CALL_VAR_NUM(NULL, i) for the variable's index i.
 */
static int lookup_cv(zend_op_array *op_array, zend_string* name) {
int i = 0;
zend_ulong hash_value = zend_string_hash_val(name);
// walk op_array->vars to see whether this variable is already known
while (i < op_array->last_var) {
// same character buffer, or equal hash + length + bytes
if (ZSTR_VAL(op_array->vars[i]) == ZSTR_VAL(name) ||
(ZSTR_H(op_array->vars[i]) == hash_value &&
ZSTR_LEN(op_array->vars[i]) == ZSTR_LEN(name) &&
memcmp(ZSTR_VAL(op_array->vars[i]), ZSTR_VAL(name), ZSTR_LEN(name)) == 0)) {
zend_string_release(name);
return (int)(zend_intptr_t)ZEND_CALL_VAR_NUM(NULL, i);
}
i++;
}
// this is a new variable
i = op_array->last_var;
op_array->last_var++;
if (op_array->last_var > CG(context).vars_size) {
CG(context).vars_size += 16; /* FIXME */
op_array->vars = erealloc(op_array->vars, CG(context).vars_size * sizeof(zend_string*));// grow vars
}

op_array->vars[i] = zend_new_interned_string(name);
// per the notes, with NULL as the base this evaluates to 96 + i * sizeof(zval)
return (int)(zend_intptr_t)ZEND_CALL_VAR_NUM(NULL, i);
}

其中96根据zend_execute_data的大小确定

生成opline

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*
 * Appends one opline with the given opcode to CG(active_op_array).
 * op1 / op2 may be NULL, in which case that operand is marked unused.
 * When result is non-NULL it is set up via zend_make_var_result().
 * Returns the new opline.
 */
static zend_op *zend_emit_op(znode *result, zend_uchar opcode, znode *op1, znode *op2) /* {{{ */
{
// allocate the opline and set its opcode
zend_op *opline = get_next_op(CG(active_op_array));
opline->opcode = opcode;
// set op1 and op2
if (op1 == NULL) {
SET_UNUSED(opline->op1);
} else {
SET_NODE(opline->op1, op1);
}

if (op2 == NULL) {
SET_UNUSED(opline->op2);
} else {
SET_NODE(opline->op2, op2);
}
// set the result operand
if (result) {
zend_make_var_result(result, opline);
}
return opline;
}

总流程图

pass_two()

上述步骤结束后,最后还会通过zend_emit_final_return()生成一条ZEND_RETURN指令作为结束指令。在此之后,pass_two()会对一些特殊的opcode进行处理,还会把VAR/TMP_VAR/CONST操作数由递增编号转为内存偏移值

1
2
3
4
5
6
7
/* Excerpt from compile_file(): the final return is emitted, then pass_two() runs. */
if (!zendparse()) {
...
zend_compile_top_stmt(CG(ast));
zend_emit_final_return(&retval_zv);
...
pass_two(op_array);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Second pass over a compiled op_array (excerpt; `...` marks elided code).
 * For every opline: IS_CONST operands go through
 * ZEND_PASS_TWO_UPDATE_CONSTANT, IS_VAR / IS_TMP_VAR operands and results
 * are rewritten to ZEND_CALL_VAR_NUM(NULL, last_var + n), and the opcode
 * handler is set. Finally the op_array is flagged ZEND_ACC_DONE_PASS_TWO.
 * Returns 0.
 */
ZEND_API int pass_two(zend_op_array *op_array)
{
zend_op *opline, *end;
...
opline = op_array->opcodes;
end = opline + op_array->last;
// walk every opline
while (opline < end){
...
if (opline->op1_type == IS_CONST) {
ZEND_PASS_TWO_UPDATE_CONSTANT(op_array, opline->op1);
} else if (opline->op1_type & (IS_VAR|IS_TMP_VAR)) {
opline->op1.var = (uint32_t)(zend_intptr_t)ZEND_CALL_VAR_NUM(NULL, op_array->last_var + opline->op1.var);
}
if (opline->op2_type == IS_CONST) {
ZEND_PASS_TWO_UPDATE_CONSTANT(op_array, opline->op2);
} else if (opline->op2_type & (IS_VAR|IS_TMP_VAR)) {
opline->op2.var = (uint32_t)(zend_intptr_t)ZEND_CALL_VAR_NUM(NULL, op_array->last_var + opline->op2.var);
}
if (opline->result_type & (IS_VAR|IS_TMP_VAR)) {
opline->result.var = (uint32_t)(zend_intptr_t)ZEND_CALL_VAR_NUM(NULL, op_array->last_var + opline->result.var);
}
// set the handler for this opcode
ZEND_VM_SET_OPCODE_HANDLER(opline);
opline++;
}
// mark this op_array as having been through pass_two()
op_array->fn_flags |= ZEND_ACC_DONE_PASS_TWO;
return 0;
}