准备调试环境

目前 CPython 的开发已经迁移到了 Github 上,可以直接去 Github clone 对应的分支。 我们将基于 Python 2.7.13 版本, Linux x86_64 环境进行接下来的工作。 下载好代码以后以

./configure --with-pydebug

make -j2

编译。

调试可以直接使用 GDB, 然后使用 Emacs + Ctags 看代码。(喜欢使用 IDE 的话可以考虑 eclipse-cdt)

寻找参考资料

官方的 Python Developer’s Guide 是开始时必备的资料。 readthedocs 上另外一个版本的 Python Developer’s Guide 更详细一点。 Exploring CPython’s Internals 一节列出了 CPython 的目录结构, 以及推荐了几篇很有参考价值的文章。

可以参考 devguide 的 compiler 一节来调试 Python 解释器, 跟着执行流程一步步看代码。

另外 Philip Guo 的博客 上关于 Python 的博文也十分有价值。

中文资料中 《Python 源码剖析》 一定不能错过。

从 main 开始

在 gdb 中直接 b main,输入 list 可以看到对应 main 函数的代码为

// Modules/python.c
#include "Python.h"

int
main(int argc, char **argv)
{
	return Py_Main(argc, argv);
}

在编辑器中打开对应文件,在 Python 2.7 中位于 Modules/python.c, 该文件只是一个入口,直接调用了 Py_MainPy_Main 位于 Modules/main.c 中, 该函数的主要作用如下:

  • 初始化环境变量和命令行参数
  • 如果参数里有 -R 则调用 _PyRandom_Init 初始化 Hash 算法的随机数生成,使得 dict 对象中 key 的顺序每次启动时随机。
  • 调用 Py_Initialize 初始化 Python 解释器
  • 根据命令行选项决定是运行模块 -m RunModule, 还是运行一条语句 -c PyRun_SimpleStringFlags, 或是运行一个文件PyRun_AnyFileExFlags

我们先跟一下 PyRun_SimpleStringFlags, 该函数添加了一个默认的 __main__ 后直接调用了 PyRun_StringFlags

截取 PyRun_StringFlags 函数部分代码如下

PyParser_ASTFromString 对输入的源码进行词法语法分析生成 AST,run_mod生成字节码并运行。

// Python/pythonrun.c PyRun_StringFlags
PyObject *
PyRun_StringFlags(const char *str, int start, PyObject *globals,
                  PyObject *locals, PyCompilerFlags *flags)
{
    PyObject *ret = NULL;
    mod_ty mod;
    PyArena *arena = PyArena_New(); /* 为编译阶段申请一块内存池 */
    if (arena == NULL)
        return NULL;

    mod = PyParser_ASTFromString(str, "<string>", start, flags, arena); /* 将语句解析为 AST */
    if (mod != NULL)
        ret = run_mod(mod, "<string>", globals, locals, flags, arena); /* 调用 run_mod */
    PyArena_Free(arena);
    return ret;
}

词法语法分析

PyParser_ASTFromString 先将源码解析为 Parser Tree, 再调用 PyAST_FromNode,将 Parser Tree 转换为 AST。

// Python/pythonrun.c 54 行 引用了 graminit 中的 grammar 结构,该结构将在下面几个函数中以 `g` 变量名传递 
extern grammar _PyParser_Grammar; /* From graminit.c */ 
// Python/pythonrun.c PyParser_ASTFromString
mod_ty
PyParser_ASTFromString(const char *s, const char *filename, int start,
                       PyCompilerFlags *flags, PyArena *arena)
{
    mod_ty mod;
    PyCompilerFlags localflags;
    perrdetail err;
    int iflags = PARSER_FLAGS(flags);

    // 将源码解析为了 Parser Tree
    node *n = PyParser_ParseStringFlagsFilenameEx(s, filename,
                                    &_PyParser_Grammar, start, &err,
                                    &iflags);
    if (flags == NULL) {
        localflags.cf_flags = 0;
        flags = &localflags;
    }
    if (n) {
        flags->cf_flags |= iflags & PyCF_MASK;
        // 将 Parser Tree 转换为 AST
        mod = PyAST_FromNode(n, flags, filename, arena);
        PyNode_Free(n);
        return mod; // 返回 AST
    }
    else {
        err_input(&err);
        return NULL;
    }
}

生成 TOKEN

// Parser/parsetok.c PyParser_ParseStringFlagsFilenameEx
node *
PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
                          grammar *g, int start,
                          perrdetail *err_ret, int *flags)
{
    struct tok_state *tok;

    initerr(err_ret, filename);
    // 初始化词法分析模块
    if ((tok = PyTokenizer_FromString(s, start == file_input)) == NULL) {
        err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM;
        return NULL;
    }

    tok->filename = filename ? filename : "<string>";
    if (Py_TabcheckFlag || Py_VerboseFlag) {
        tok->altwarning = (tok->filename != NULL);
        if (Py_TabcheckFlag >= 2)
            tok->alterror++;
    }
    // 调用 parsetok ,将源码转换为 token 序列
    return parsetok(tok, g, start, err_ret, flags);

parsetok 循环直到读完整个字符串,将分出来的 token 添加到 parser 中, 最后返回 parser tree token 的 type 定义在 Include/token.h

// Parser/parsetok.c parsetok
static node *
parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
         int *flags)
{
    // 初始化 parser 
    if ((ps = PyParser_New(g, start)) == NULL) {
    ....
    // 获取 token type
    type = PyTokenizer_Get(tok, &a, &b);
    ....
    // 将 token 赋值给 str
    len = b - a; /* XXX this may compute NULL - NULL */
    str = (char *) PyObject_MALLOC(len + 1);
    if (len > 0)
        strncpy(str, a, len);
    str[len] = '\0';
    ....
    // 将 token 添加到 parser tree 中
    if ((err_ret->error =
             PyParser_AddToken(ps, (int)type, str, tok->lineno, col_offset,
                               &(err_ret->expected))) != E_OK) {
            if (err_ret->error != E_DONE) {
                PyObject_FREE(str);
                err_ret->token = type;
            }
            break;
        }
    ....
    if (err_ret->error == E_DONE) {
        n = ps->p_tree;
        ps->p_tree = NULL;
    }
    else
        n = NULL;
    ....
    done:
        PyTokenizer_Free(tok); // 释放词法分析器
    return n; // 返回 parser->p_tree
}

a = 1 为例,循环5次得到的 str 和 type 依次为:

  • str: a, type: 1(NAME)
  • str: =, type: 22(EQUAL)
  • str: 1, type: 2(NUMBER)
  • str: \n, type: 4(NEWLINE)
  • str: 未初始化, type: 0(ENDMARKER)

node 的结构定义在 /Include/node.h

typedef struct _node {
    short		n_type;
    char		*n_str;
    int			n_lineno;
    int			n_col_offset;
    int			n_nchildren;
    struct _node	*n_child;
} node;

Add TOKEN to Parser Tree

使用 export PYTHONDEBUG=1 开启 python 的调试模式,可以看到 python 将 TOKEN 解析为 Parser Tree 的过程。

./Python-2.7.12/python -d -c 'a = 1'
Token NAME/'a' ... It's a token we know
....
 DFA 'atom', state 0: Shift.
  DFA 'atom', state 5: Direct pop.
Token EQUAL/'=' ... It's a token we know
....
 DFA 'expr_stmt', state 1: Shift.
Token NUMBER/'1' ... It's a token we know
....
  DFA 'atom', state 5: Direct pop.
Token NEWLINE/'' ... It's a token we know
....
  DFA 'simple_stmt', state 3: Direct pop.
  DFA 'stmt', state 1: Direct pop.
Token NEWLINE/'' ... It's a token we know
 DFA 'file_input', state 0: Shift.
Token ENDMARKER/'' ... It's a token we know
 DFA 'file_input', state 0: Shift.
  DFA 'file_input', state 1: Direct pop.
  ACCEPT.
[35150 refs]

可以看到只有正确解析的 TOKEN 才会显示 "Direct pop

生成 TOKEN 后,按照语法规则生成 Parser Tree

// Parser/parser.c PyParser_AddToken
int
PyParser_AddToken(register parser_state *ps, register int type, char *str,
                  int lineno, int col_offset, int *expected_ret)
{
    ...
    /* Loop until the token is shifted or an error occurred */
    for (;;) {
        ...
        /* Pop while we are in an accept-only state */
                while (s = &d->d_state
                                [ps->p_stack.s_top->s_state],
                    s->s_accept && s->s_narcs == 1) {
                    D(printf("  DFA '%s', state %d: "
                             "Direct pop.\n",
                             d->d_name,
                             ps->p_stack.s_top->s_state));
        ...
    }
}

所以上面 Tree 的构建顺序为 atom expr_stmt atom simple_stmt stmt file_input

Parser Tree to AST

在上面的 PyParser_ASTFromString 函数中可以看到,在调用 PyParser_ParseStringFlagsFilenameEx 生成 Parser Tree ( concrete syntax tree ) 后,调用 PyAST_FromNode 生成 AST。

// ast.c PyAST_FromNode
mod_ty
PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename,
               PyArena *arena)
{
    ...
    stmts = asdl_seq_new(num_stmts(n), arena); /* 为 AST 申请内存 */
    if (!stmts)
        return NULL;
    for (i = 0; i < NCH(n) - 1; i++) {
        ch = CHILD(n, i);
        if (TYPE(ch) == NEWLINE)
            continue;
        REQ(ch, stmt);
        num = num_stmts(ch);
        if (num == 1) {
            s = ast_for_stmt(&c, ch); /* 生成对应的 AST Node */
            if (!s)
                goto error;
            asdl_seq_SET(stmts, k++, s); /* 将对应的 AST Node 设置到 asdl_seq 中对应的位置 */
        }
        ...
        return Module(stmts, arena); /* 转换 asdl_seq 为 mod_ty, mod_ty 事实上只是一个对
                                        asdl_seq 的简单封装,给 asdl_seq 加上了类型而已 */
    }
    ...
}

asdl_seq 结构定义在 Include/asdl.h 中,其实就是一个放 AST Node 的数组,根据注释可以看出,这个文件是由 Parser/asdl_c.py 生成的。

/* It would be nice if the code generated by asdl_c.py was completely
   independent of Python, but it is a goal the requires too much work
   at this stage.  So, for example, I'll represent identifiers as
   interned Python strings.
*/

/* XXX A sequence should be typed so that its use can be typechecked. */

typedef struct {
    int size;
    void *elements[1];
} asdl_seq;

asdl_c.pyParser/Python.asdl 中定义的 asdl 生成 C 代码,对应的 asdl 的实现在 Parser/asdl.py 中。

ast_for_xxx 是生成对应 xxx AST Node 的函数, ast_for_stmt 就是生成 stmt node (stmt_ty 由 asdl_c.py 生成)。

ast_for_stmt 其实就是一个巨大的 switch ,根据语法规则调用不同的 ast_for_xxx, 例如 a = 1,就会调用到 ast_for_expr

a = 1 生成 AST 的 callstack 如下

  • ast_for_stmt
  • ast_for_expr_stmt
  • ast_for_testlist
  • ast_for_expr

最后返回的 AST 可以由下面的 Python 代码查看

import ast
t = ast.parse('a = 1')
ast.dump(t)

>>> "Module(body=[Assign(targets=[Name(id='a', ctx=Store())], value=Num(n=1))])"

AST to CodeObject

上面分析到 PyParser_ASTFromString 将源码解析为 AST (mod_ty),调用 run_mod 函数。

run_mod 中有两个主要的函数:

  • PyAST_Compile,
  • PyEval_EvalCode

PyAST_Compile 将 AST 编译成 CodeObject,PyEval_EvalCode 运行编译后的 CodeObject。

// Python/pythonrun.c run_mod
static PyObject *
run_mod(mod_ty mod, const char *filename, PyObject *globals, PyObject *locals,
         PyCompilerFlags *flags, PyArena *arena)
{
    PyCodeObject *co;
    PyObject *v;
    co = PyAST_Compile(mod, filename, flags, arena);
    if (co == NULL)
        return NULL;
    v = PyEval_EvalCode(co, globals, locals);
    Py_DECREF(co);
    return v;
}

我们先看一下编译的过程

Build Symbol Table

PyAST_Compile 里主要使用 PySymtable_Build, compiler_mod 两个函数

// Python/compile.c PyAST_Compile
PyCodeObject *
PyAST_Compile(mod_ty mod, const char *filename, PyCompilerFlags *flags,
              PyArena *arena)
{
    struct compiler c; // 初始化 compiler 结构
    PyCodeObject *co = NULL;
    PyCompilerFlags local_flags;
    ...

    if (!compiler_init(&c))
        return NULL;
    c.c_filename = filename;
    c.c_arena = arena;
    c.c_future = PyFuture_FromAST(mod, filename);
    ...

    c.c_st = PySymtable_Build(mod, filename, c.c_future);

    co = compiler_mod(&c, mod);

 finally:
    compiler_free(&c);
    assert(co || PyErr_Occurred());
    return co;
}

PySymtable_Build 遍历 AST 构建 Symbol table,存储一些类名、变量名之类的信息。

struct symtable {
    const char *st_filename; /* name of file being compiled */
    struct _symtable_entry *st_cur; /* current symbol table entry */
    struct _symtable_entry *st_top; /* module entry */
    PyObject *st_symbols;    /* dictionary of symbol table entries */
    PyObject *st_stack;      /* stack of namespace info */
    PyObject *st_global;     /* borrowed ref to MODULE in st_symbols */
    int st_nblocks;          /* number of blocks */
    PyObject *st_private;        /* name of current class or NULL */
    PyFutureFeatures *st_future; /* module's future features */
};

typedef struct _symtable_entry {
    PyObject_HEAD
    PyObject *ste_id;        /* int: key in st_symbols */
    PyObject *ste_symbols;   /* dict: name to flags */
    PyObject *ste_name;      /* string: name of block */
    PyObject *ste_varnames;  /* list of variable names */
    PyObject *ste_children;  /* list of child ids */
    _Py_block_ty ste_type;   /* module, class, or function */
    int ste_unoptimized;     /* false if namespace is optimized */
    int ste_nested;      /* true if block is nested */
    unsigned ste_free : 1;        /* true if block has free variables */
    unsigned ste_child_free : 1;  /* true if a child block has free vars,
                                     including free refs to globals */
    unsigned ste_generator : 1;   /* true if namespace is a generator */
    unsigned ste_varargs : 1;     /* true if block has varargs */
    unsigned ste_varkeywords : 1; /* true if block has varkeywords */
    unsigned ste_returns_value : 1;  /* true if namespace uses return with
                                        an argument */
    int ste_lineno;          /* first line of block */
    int ste_opt_lineno;      /* lineno of last exec or import * */
    int ste_tmpname;         /* counter for listcomp temp vars */
    struct symtable *ste_table;
} PySTEntryObject;

Build CFG

AST 传入 compiler_mod 后会调用 compiler_XXX 生成 CFG。 然后 assemble 会从 CFG 生成最后的 bytecode。

// Python/compiler.c compiler_mod 
static PyCodeObject *
compiler_mod(struct compiler *c, mod_ty mod)
{
    PyCodeObject *co;
    int addNone = 1;
    ...
    if (!compiler_enter_scope(c, module, mod, 0))
        return NULL;
    switch (mod->kind) {
    case Module_kind:
        if (!compiler_body(c, mod->v.Module.body)) {
            compiler_exit_scope(c);
            return 0;
        }
        break;
    ...
    }
    co = assemble(c, addNone);
    compiler_exit_scope(c);
    return co;
}

根据上面 AST 生成一节, a = 1 的 AST 为 Module(body=[Assign(targets=[Name(id='a', ctx=Store())], value=Num(n=1))]),所以将会执行 compiler_body

// Python/compiler.c compiler_body
static int
compiler_body(struct compiler *c, asdl_seq *stmts)
{
    ...
    for (; i < asdl_seq_LEN(stmts); i++)
        VISIT(c, stmt, (stmt_ty)asdl_seq_GET(stmts, i));
    ...
}

VISIT 是定义在 compiler.c 中的一个宏,根据 AST Node 的 Type,调用不同的 compiler_visit_xxx 函数。

#define VISIT(C, TYPE, V) {\
    if (!compiler_visit_ ## TYPE((C), (V))) \
        return 0; \
}

这里 VISIT 宏展开是 compiler_visit_stmt

// Python/compiler.c compiler_visit_stmt
static int
compiler_visit_stmt(struct compiler *c, stmt_ty s)
{
    ...
    switch (s->kind) {
        case Assign_kind:
            n = asdl_seq_LEN(s->v.Assign.targets);
            VISIT(c, expr, s->v.Assign.value);
            for (i = 0; i < n; i++) {
                if (i < n - 1)
                    ADDOP(c, DUP_TOP);
                VISIT(c, expr,
                    (expr_ty)asdl_seq_GET(s->v.Assign.targets, i));
            }
            break;
    }
    ...
}

首先调用 VISIT(c, expr, s->v.Assign.value); 也就是 compiler_visit_expr 函数。 传入的参数为 Assign.value, 根据 AST value=Num(n=1)。可以在 gdb 中验证。

>> p e.kind
$2 = Num_kind
// Python/compiler.c compiler_visit_expr
static int
compiler_visit_expr(struct compiler *c, expr_ty e)
{
    switch (e->kind) {
        case Num_kind:
        ADDOP_O(c, LOAD_CONST, e->v.Num.n, consts);
        break;
    }
}

ADDOP_O 是一个宏,用于将 opcode 添加到 compiler 结构中。添加 opcode 有以下几个宏:

// 添加一个 opcode
#define ADDOP(C, OP) { \
    if (!compiler_addop((C), (OP))) \
        return 0; \
}


#define ADDOP_IN_SCOPE(C, OP) { \
    if (!compiler_addop((C), (OP))) { \
        compiler_exit_scope(c); \
        return 0; \
    } \
}

// 添加一个 opcode ,带一个没有具体名字的参数,`a=1` 中的 value(Num(1))
#define ADDOP_O(C, OP, O, TYPE) { \
    if (!compiler_addop_o((C), (OP), (C)->u->u_ ## TYPE, (O))) \
        return 0; \
}

// 添加一个 opcode, 带一个有名字的参数,如变量名
#define ADDOP_NAME(C, OP, O, TYPE) { \
    if (!compiler_addop_name((C), (OP), (C)->u->u_ ## TYPE, (O))) \
        return 0; \
}

#define ADDOP_I(C, OP, O) { \
    if (!compiler_addop_i((C), (OP), (O))) \
        return 0; \
}

#define ADDOP_JABS(C, OP, O) { \
    if (!compiler_addop_j((C), (OP), (O), 1)) \
        return 0; \
}

#define ADDOP_JREL(C, OP, O) { \
    if (!compiler_addop_j((C), (OP), (O), 0)) \
        return 0; \
}

所有带参数的 opcode 最后都会调用到 compiler_addop_i,AST 中的参数是数字或者变量名,到了 opcode 中都是一个 integer,为对应值在 compiler 结构中的下标。

对应的 opcode 定义在 include/opcode.h 中,如 opcode =100 的定义为 #define LOAD_CONST 100 /* Index in const list */

/* Add an opcode with an integer argument.
   Returns 0 on failure, 1 on success.
*/
static int
compiler_addop_i(struct compiler *c, int opcode, int oparg)
{
    struct instr *i;
    int off;
    off = compiler_next_instr(c, c->u->u_curblock);
    if (off < 0)
        return 0;
    i = &c->u->u_curblock->b_instr[off];
    i->i_opcode = opcode;
    i->i_oparg = oparg;
    i->i_hasarg = 1;
    compiler_set_lineno(c, off);
    return 1;
}

assemble

compile_mod 中,compiler_xxx 构建出了 CFG 调用 assemble 函数构建出最后的 CodeObject。

static PyCodeObject *
assemble(struct compiler *c, int addNone)
{
    basicblock *b, *entryblock;
    struct assembler a;
    int i, j, nblocks;
    PyCodeObject *co = NULL;

    /* Make sure every block that falls off the end returns None.
       XXX NEXT_BLOCK() isn't quite right, because if the last
       block ends with a jump or return b_next shouldn't set.
     */
    if (!c->u->u_curblock->b_return) {
        NEXT_BLOCK(c);
        if (addNone)
            ADDOP_O(c, LOAD_CONST, Py_None, consts);
        ADDOP(c, RETURN_VALUE);
    }

    // 找到 entryblock
    nblocks = 0;
    entryblock = NULL;
    for (b = c->u->u_blocks; b != NULL; b = b->b_list) {
        nblocks++;
        entryblock = b;
    }
    ...
    if (!assemble_init(&a, nblocks, c->u->u_firstlineno))
        goto error;
    dfs(c, entryblock, &a);

    /* Can't modify the bytecode after computing jump offsets. */
    assemble_jump_offsets(&a, c);

    /* Emit code in reverse postorder from dfs. */
    for (i = a.a_nblocks - 1; i >= 0; i--) {
        b = a.a_postorder[i];
        for (j = 0; j < b->b_iused; j++)
            if (!assemble_emit(&a, &b->b_instr[j]))
                goto error;
    }

    if (_PyString_Resize(&a.a_lnotab, a.a_lnotab_off) < 0)
        goto error;
    if (_PyString_Resize(&a.a_bytecode, a.a_offset) < 0)
        goto error;

    co = makecode(c, &a);
 error:
    assemble_free(&a);
    return co;
}

assemble 中先调用 dfs 以后序深度优先遍历 struct compiler(CFG),将 CFG 平坦化,然后调用 assemble_jump_offsets 生成跳转地址。最后调用 makecode 生成 CodeObject。

调试的过程中,可以使用 dis 模块来查看源码对应的字节码。将源码写入 test.py, 然后调用 python -m dis test.py 即可。

echo 'a=1' > test.py
python2 -m dis test.py                                                   
  1           0 LOAD_CONST               0 (1)
              3 STORE_NAME               0 (a)
              6 LOAD_CONST               1 (None)
              9 RETURN_VALUE

CodeObject 的结构定义在 include/code.h

/* Bytecode object */
typedef struct {
    PyObject_HEAD
    int co_argcount;		/* #arguments, except *args */
    int co_nlocals;		/* #local variables */
    int co_stacksize;		/* #entries needed for evaluation stack */
    int co_flags;		/* CO_..., see below */
    PyObject *co_code;		/* instruction opcodes */
    PyObject *co_consts;	/* list (constants used) */
    PyObject *co_names;		/* list of strings (names used) */
    PyObject *co_varnames;	/* tuple of strings (local variable names) */
    PyObject *co_freevars;	/* tuple of strings (free variable names) */
    PyObject *co_cellvars;      /* tuple of strings (cell variable names) */
    /* The rest doesn't count for hash/cmp */
    PyObject *co_filename;	/* string (where it was loaded from) */
    PyObject *co_name;		/* string (name, for reference) */
    int co_firstlineno;		/* first source line number */
    PyObject *co_lnotab;	/* string (encoding addr<->lineno mapping) See
				   Objects/lnotab_notes.txt for details. */
    void *co_zombieframe;     /* for optimization only (see frameobject.c) */
    PyObject *co_weakreflist;   /* to support weakrefs to code objects */
} PyCodeObject;

Eval CodeObject

run_mod 中构建出 CodeObject 后调用 PyEval_EvalCode 运行 CodeObject。 可以看到, PyEval_EvalCode 只是简单调用了 PyEval_EvalCodeEx

PyObject *
PyEval_EvalCode(PyCodeObject *co, PyObject *globals, PyObject *locals)
{
    return PyEval_EvalCodeEx(co,
                      globals, locals,
                      (PyObject **)NULL, 0,
                      (PyObject **)NULL, 0,
                      (PyObject **)NULL, 0,
                      NULL);
}

PyEval_EvalCodeEx 构建出 PyFrameObject 然后执行 PyEval_EvalFrameEx

// Python/ceval.c PyEval_EvalCodeEx

PyObject *
PyEval_EvalCodeEx(PyCodeObject *co, PyObject *globals, PyObject *locals,
           PyObject **args, int argcount, PyObject **kws, int kwcount,
           PyObject **defs, int defcount, PyObject *closure)
{
    register PyFrameObject *f;
    ...
    retval = PyEval_EvalFrameEx(f,0);
    ...
    return retval;
}

FrameObject

PyFrameObject 定义在 include/frameobject.h

typedef struct _frame {
    PyObject_VAR_HEAD
    struct _frame *f_back;	/* previous frame, or NULL */
    PyCodeObject *f_code;	/* code segment */
    PyObject *f_builtins;	/* builtin symbol table (PyDictObject) */
    PyObject *f_globals;	/* global symbol table (PyDictObject) */
    PyObject *f_locals;		/* local symbol table (any mapping) */
    PyObject **f_valuestack;	/* points after the last local */
    /* Next free slot in f_valuestack.  Frame creation sets to f_valuestack.
       Frame evaluation usually NULLs it, but a frame that yields sets it
       to the current stack top. */
    PyObject **f_stacktop;
    PyObject *f_trace;		/* Trace function */

    /* If an exception is raised in this frame, the next three are used to
     * record the exception info (if any) originally in the thread state.  See
     * comments before set_exc_info() -- it's not obvious.
     * Invariant:  if _type is NULL, then so are _value and _traceback.
     * Desired invariant:  all three are NULL, or all three are non-NULL.  That
     * one isn't currently true, but "should be".
     */
    PyObject *f_exc_type, *f_exc_value, *f_exc_traceback;

    PyThreadState *f_tstate;
    int f_lasti;		/* Last instruction if called */
    /* Call PyFrame_GetLineNumber() instead of reading this field
       directly.  As of 2.3 f_lineno is only valid when tracing is
       active (i.e. when f_trace is set).  At other times we use
       PyCode_Addr2Line to calculate the line from the current
       bytecode index. */
    int f_lineno;		/* Current line number */
    int f_iblock;		/* index in f_blockstack */
    PyTryBlock f_blockstack[CO_MAXBLOCKS]; /* for try and loop blocks */
    PyObject *f_localsplus[1];	/* locals+stack, dynamically sized */
} PyFrameObject;

可以看到 FrameObject 中存储了对应的字节码,local、global、builtin 三种变量,以及数据栈等运行时必须的信息。

PyTryBlock 中用来处理 Try 和 Loop(循环) 语句。在 break continue return 等时候可以跳转到 b_handler 记录的位置继续执行。

typedef struct {
    int b_type;			/* what kind of block this is */
    int b_handler;		/* where to jump to find handler */
    int b_level;		/* value stack level to pop to */
} PyTryBlock;

PyEval_EvalFrameEx

PyEval_EvalFrameEx 是 CPython 解释器最主要的求值函数,核心是一个循环里的巨大的 switch case,对不同的 opcode 执行不同的操作。

下面摘抄部分 PyEval_EvalFrameEx 代码,忽略 profiledebug 相关功能的代码。

可以看到在 PyEval_EvalFrameEx 的 for 循环中,先判断了锁的状态,确保同一时间只有一个线程访问解释器,然后通过 NEXTOP 等宏操作 next_instr 指针,以执行不同的字节码。

// Python/ceval.c PyEval_EvalFrameEx

PyObject *
PyEval_EvalFrameEx(PyFrameObject *f, int throwflag)
{
    ...
    register PyObject **stack_pointer;  /* Next free slot in value stack */
    register unsigned char *next_instr;
    register int opcode;        /* Current opcode */
    register int oparg;         /* Current opcode argument, if any */
    register enum why_code why; /* Reason for block stack unwind */
    register int err;           /* Error status -- nonzero if error */
    register PyObject *x;       /* Result object -- NULL if error */
    register PyObject *v;       /* Temporary objects popped off stack */
    register PyObject *w;
    register PyObject *u;
    register PyObject *t;
    register PyObject *stream = NULL;    /* for PRINT opcodes */
    register PyObject **fastlocals, **freevars;
    PyObject *retval = NULL;            /* Return value */
    PyThreadState *tstate = PyThreadState_GET();
    PyCodeObject *co;

    unsigned char *first_instr;
    PyObject *names;
    PyObject *consts;
    ...
    /* Start of code */

    if (f == NULL)
        return NULL;

    /* push frame */
    if (Py_EnterRecursiveCall(""))
        return NULL;

    tstate->frame = f;
    ...
    co = f->f_code;
    names = co->co_names;
    consts = co->co_consts;
    fastlocals = f->f_localsplus;
    freevars = f->f_localsplus + co->co_nlocals;
    first_instr = (unsigned char*) PyString_AS_STRING(co->co_code);
    next_instr = first_instr + f->f_lasti + 1;
    stack_pointer = f->f_stacktop;
    assert(stack_pointer != NULL);
    f->f_stacktop = NULL;       /* remains NULL unless yield suspends frame */
    ...
    why = WHY_NOT;
    err = 0;
    x = Py_None;        /* Not a reference, just anything non-NULL */
    w = NULL;

    for (;;) {
        ...
            if (interpreter_lock) {
                /* Give another thread a chance */

                if (PyThreadState_Swap(NULL) != tstate)
                    Py_FatalError("ceval: tstate mix-up");
                PyThread_release_lock(interpreter_lock);

                /* Other threads may run now */

                PyThread_acquire_lock(interpreter_lock, 1);

                if (PyThreadState_Swap(tstate) != NULL)
                    Py_FatalError("ceval: orphan tstate");

                /* Check for thread interrupts */

                if (tstate->async_exc != NULL) {
                    x = tstate->async_exc;
                    tstate->async_exc = NULL;
                    PyErr_SetNone(x);
                    Py_DECREF(x);
                    why = WHY_EXCEPTION;
                    goto on_error;
                }
            }
        ...
         /* Extract opcode and argument */

        opcode = NEXTOP();
        oparg = 0;   /* allows oparg to be stored in a register because
            it doesn't have to be remembered across a full loop */
        if (HAS_ARG(opcode))
            oparg = NEXTARG();
    dispatch_opcode:
        /* Main switch on opcode */

        switch (opcode) {

        /* BEWARE!
           It is essential that any operation that fails sets either
           x to NULL, err to nonzero, or why to anything but WHY_NOT,
           and that no operation that succeeds does this! */

        /* case STOP_CODE: this is an error! */

        TARGET_NOARG(NOP)
        {
            FAST_DISPATCH();
        }

        TARGET(LOAD_FAST)
        {
            x = GETLOCAL(oparg);
            if (x != NULL) {
                Py_INCREF(x);
                PUSH(x);
                FAST_DISPATCH();
            }
            format_exc_check_arg(PyExc_UnboundLocalError,
                UNBOUNDLOCAL_ERROR_MSG,
                PyTuple_GetItem(co->co_varnames, oparg));
            break;
        }
        ... // 省略剩余 opcode 对应的 case
    }/* switch */

        /* End the loop if we still have an error (or return) */

        if (why != WHY_NOT)
            break;

    } /* main loop */

    assert(why != WHY_YIELD);
    /* Pop remaining stack entries. */
    while (!EMPTY()) {
        v = POP();
        Py_XDECREF(v);
    }

    if (why != WHY_RETURN)
        retval = NULL;

    return retval;
}

其中 why 变量是一个表示 main loop 状态的枚举值。

/* Status code for main loop (reason for stack unwind) */
enum why_code {
        WHY_NOT =       0x0001, /* No error */
        WHY_EXCEPTION = 0x0002, /* Exception occurred */
        WHY_RERAISE =   0x0004, /* Exception re-raised by 'finally' */
        WHY_RETURN =    0x0008, /* 'return' statement */
        WHY_BREAK =     0x0010, /* 'break' statement */
        WHY_CONTINUE =  0x0020, /* 'continue' statement */
        WHY_YIELD =     0x0040  /* 'yield' operator */
};

其中 PUSH POP 都是对 stack_pointer 操作的宏,stack_pointer 指向 FrameObject 中的运行时栈,初始化时指向栈顶。

stack_pointer = f->f_stacktop;

NEXTOP 以及 JUMPTO 都是对 next_instr 进行操作的宏,如

#define NEXTOP() (*next_instr++)

控制下一个执行的 opcode

#define NEXTARG() (next_instr += 2, (next_instr[-1]<<8) + next_instr[-2]) 是从 opcode 中提出参数。

可以看出, CPython 虚拟机是基于栈、支持多线程和协程(yield),并且支持异常处理,和许多语言特性。