之前看了 a-python-interpreter-written-in-python 和 byterun,就想试试用 JAVA 解析 Python 生成的 pyc 文件,读取 bytecode 后在 JAVA 中实现解释执行。

要解析 pyc 文件,就需要知道其来龙去脉,以及是如何生成的。

pyc

根据平时编写 Python 代码的经验,pyc 文件是在我们 import 一个模块后生成的。

imp module

而官方文档中提到了 imp 模块是用来和 import 语句的具体实现机制交互的。其中:

  • find_module 函数负责到 sys.path 中寻找对应的 module
  • 若存在需要的 module,则调用 load_module 加载对应 module

根据之前分析 CPython 源码的经验, 标准库模块中和运行逻辑相关的函数一般对应着一个 CPython 解释器中的 C 代码实现。

import.c

如 load_module 就位于 https://github.com/python/cpython/blob/2.7/Python/import.c#L1929

/* Load an external module using the default search path and return
   its module object WITH INCREMENTED REFERENCE COUNT.

   name     - module name; for C_BUILTIN / PY_FROZEN it is replaced by
              pathname when pathname is a non-empty string.
   fp       - open file for the module; must be non-NULL for PY_SOURCE
              and PY_COMPILED.
   pathname - passed through to the type-specific loader.
   type     - module type code; selects the loader in the switch below.
   loader   - import-hook loader object; only used for IMP_HOOK.

   Returns NULL on failure.  Where this function detects the problem
   itself it sets ValueError or ImportError; other failures are passed
   through from the loader that was called. */

static PyObject *
load_module(char *name, FILE *fp, char *pathname, int type, PyObject *loader)
{
    PyObject *modules;
    PyObject *m;
    int err;

    /* First check that there's an open file (if we need one)  */
    switch (type) {
    case PY_SOURCE:
    case PY_COMPILED:
        if (fp == NULL) {
            PyErr_Format(PyExc_ValueError,
               "file object required for import (type code %d)",
                         type);
            return NULL;
        }
    }

    /* Dispatch to the loader that matches the module type. */
    switch (type) {

    case PY_SOURCE:
        m = load_source_module(name, pathname, fp);
        break;

    case PY_COMPILED:
        m = load_compiled_module(name, pathname, fp);
        break;

#ifdef HAVE_DYNAMIC_LOADING
    case C_EXTENSION:
        m = _PyImport_LoadDynamicModule(name, pathname, fp);
        break;
#endif

    case PKG_DIRECTORY:
        m = load_package(name, pathname);
        break;

    case C_BUILTIN:
    case PY_FROZEN:
        /* A non-empty pathname overrides the module name here. */
        if (pathname != NULL && pathname[0] != '\0')
            name = pathname;
        if (type == C_BUILTIN)
            err = init_builtin(name);
        else
            err = PyImport_ImportFrozenModule(name);
        /* err < 0: the initializer failed; err == 0: no such module. */
        if (err < 0)
            return NULL;
        if (err == 0) {
            PyErr_Format(PyExc_ImportError,
                         "Purported %s module %.200s not found",
                         type == C_BUILTIN ?
                                    "builtin" : "frozen",
                         name);
            return NULL;
        }
        /* The initializer does not hand back the module object, so it is
           looked up by name in the module dictionary. */
        modules = PyImport_GetModuleDict();
        m = PyDict_GetItemString(modules, name);
        if (m == NULL) {
            PyErr_Format(
                PyExc_ImportError,
                "%s module %.200s not properly initialized",
                type == C_BUILTIN ?
                    "builtin" : "frozen",
                name);
            return NULL;
        }
        /* Honour the "incremented reference count" contract; presumably
           the dictionary lookup yields a borrowed reference. */
        Py_INCREF(m);
        break;

    case IMP_HOOK: {
        if (loader == NULL) {
            PyErr_SetString(PyExc_ImportError,
                            "import hook without loader");
            return NULL;
        }
        /* Delegate to the hook: loader.load_module(name). */
        m = PyObject_CallMethod(loader, "load_module", "s", name);
        break;
    }

    default:
        PyErr_Format(PyExc_ImportError,
                     "Don't know how to import %.200s (type code %d)",
                      name, type);
        m = NULL;

    }

    return m;
}

可以看到 load_module 会检查找到的 module 是 PY_SOURCE 还是 PY_COMPILED,而这两个宏分别对应着 .py 和 .pyc 文件。

/* Table of recognised module file types.  Each entry pairs a filename
   suffix with a mode string and the module type code (PY_SOURCE or
   PY_COMPILED) that load_module() switches on; the {0, 0} entry ends
   the table.
   NOTE(review): struct filedescr is not shown here, so the meaning of
   the first two fields is inferred from their values - confirm.
   RISCOS builds use "/py" and "/pyc" instead of ".py" and ".pyc";
   MS_WINDOWS builds additionally treat ".pyw" as source. */
#ifdef RISCOS
static const struct filedescr _PyImport_StandardFiletab[] = {
    {"/py", "U", PY_SOURCE},
    {"/pyc", "rb", PY_COMPILED},
    {0, 0}
};
#else
static const struct filedescr _PyImport_StandardFiletab[] = {
    {".py", "U", PY_SOURCE},
#ifdef MS_WINDOWS
    {".pyw", "U", PY_SOURCE},
#endif
    {".pyc", "rb", PY_COMPILED},
    {0, 0}
};
#endif

我们跟入在没有 .pyc 文件时加载 .py 源文件的 load_source_module 函数(只摘录了一部分)。

https://github.com/python/cpython/blob/2.7/Python/import.c#L1076

/* Load a source module from a given file and return its module
   object WITH INCREMENTED REFERENCE COUNT.  If there's a matching
   byte-compiled file, use that instead.

   name     - module name, passed to PyImport_ExecCodeModuleEx().
   pathname - path of the source file; also used to derive cpathname.
   fp       - open source file handed to parse_source_module().

   Returns NULL on failure (via the error_exit label).

   NOTE: this is an abridged excerpt.  The local declarations (co, m,
   buf, fpc, st, mtime and cpathname are all used below) and their
   setup are elided at the "....." marker, and the body of the
   "compiled file found" branch is omitted. */

static PyObject *
load_source_module(char *name, char *pathname, FILE *fp)
{
.....
    /* Derive the compiled-file path that belongs to this source file. */
    cpathname = make_compiled_pathname(pathname, buf,
                                       (size_t)MAXPATHLEN + 1);
    if (cpathname != NULL &&
        (fpc = check_compiled_module(pathname, mtime, cpathname))) {
        /* Branch body omitted in this excerpt: per the header comment,
           the matching byte-compiled file is used instead of the source. */
    }
    else {
        /* No usable compiled file: compile the source to a code object. */
        co = parse_source_module(pathname, fp);
        if (co == NULL)
            goto error_exit;
        if (Py_VerboseFlag)
            PySys_WriteStderr("import %s # from %s\n",
                name, pathname);
        if (cpathname) {
            /* Write the compiled file unless sys.dont_write_bytecode is
               true; a missing attribute counts as false. */
            PyObject *ro = PySys_GetObject("dont_write_bytecode");
            int b = (ro == NULL) ? 0 : PyObject_IsTrue(ro);
            if (b < 0)
                goto error_exit;
            if (!b)
                write_compiled_module(co, cpathname, &st, mtime);
        }
    }
    /* Execute the code object to create the module. */
    m = PyImport_ExecCodeModuleEx(name, (PyObject *)co, pathname);
    Py_DECREF(co);

    PyMem_FREE(buf);
    return m;

error_exit:
    /* co may not have been set on this path, hence XDECREF. */
    Py_XDECREF(co);
    PyMem_FREE(buf);
    return NULL;
}

可以看到 load_source_module 同样会去找一次 .pyc 文件,在找不到的情况下,会先解析源文件,得到 code object 后调用 write_compiled_module 生成 .pyc 文件,再执行 import 逻辑。

write_compiled_module

所以,write_compiled_module 函数中应该就对应着我们的 pyc 文件生成逻辑了。

https://github.com/python/cpython/blob/2.7/Python/import.c#L951

/* Write the code object co to the byte-compiled file cpathname.

   co        - code object to marshal into the file.
   cpathname - path of the compiled file to create.
   srcstat   - stat data of the source file; only st_mode is read, to
               derive the permissions of the new file.
   mtime     - source modification time stored in the file header.

   Layout produced: the magic number (pyc_magic), then the mtime, then
   the marshalled code object.  The mtime is first written as 0 and only
   patched in after the code object has been flushed successfully.

   Failures are not reported to the caller (the function returns void);
   a message is written to stderr when Py_VerboseFlag is set. */
static void
write_compiled_module(PyCodeObject *co, char *cpathname, struct stat *srcstat, time_t mtime)
{
    FILE *fp;
#ifdef MS_WINDOWS   /* since Windows uses different permissions  */
    mode_t mode = srcstat->st_mode & ~S_IEXEC;
    /* Issue #6074: We ensure user write access, so we can delete it later
     * when the source file changes. (On POSIX, this only requires write
     * access to the directory, on Windows, we need write access to the file
     * as well)
     */
    mode |= _S_IWRITE;
#else
    /* Same permissions as the source file, minus all execute bits. */
    mode_t mode = srcstat->st_mode & ~S_IXUSR & ~S_IXGRP & ~S_IXOTH;
#endif

    fp = open_exclusive(cpathname, mode);
    if (fp == NULL) {
        if (Py_VerboseFlag)
            PySys_WriteStderr(
                "# can't create %s\n", cpathname);
        return;
    }
    PyMarshal_WriteLongToFile(pyc_magic, fp, Py_MARSHAL_VERSION);
    /* First write a 0 for mtime */
    PyMarshal_WriteLongToFile(0L, fp, Py_MARSHAL_VERSION);
    PyMarshal_WriteObjectToFile((PyObject *)co, fp, Py_MARSHAL_VERSION);
    if (fflush(fp) != 0 || ferror(fp)) {
        if (Py_VerboseFlag)
            PySys_WriteStderr("# can't write %s\n", cpathname);
        /* Don't keep partial file */
        fclose(fp);
        (void) unlink(cpathname);
        return;
    }
    /* Now write the true mtime (as a 32-bit field) */
    /* Offset 4 is right after the magic number; whence 0 means "from the
       start of the file" (SEEK_SET on common platforms). */
    fseek(fp, 4L, 0);
    assert(mtime <= 0xFFFFFFFF);
    PyMarshal_WriteLongToFile((long)mtime, fp, Py_MARSHAL_VERSION);
    fflush(fp);
    fclose(fp);
    if (Py_VerboseFlag)
        PySys_WriteStderr("# wrote %s\n", cpathname);
}

可以看到,pyc 文件的生成大致分下面几步:

  • 1.创建目标 pyc 文件
  • 2.首先调用 PyMarshal_WriteLongToFile 序列化 magic number 到文件中
  • 3.然后序列化一个空的时间戳到文件中
  • 4.PyMarshal_WriteObjectToFile 将 PyCodeObject 序列化到文件
  • 5.写完 CodeObject 后,fseek 到时间戳的位置,填充真实的时间戳

其中,magic number 定义于 import.c 头部

/*
    Python 2.7a0: 62171 (optimize list comprehensions/change LIST_APPEND)
    Python 2.7a0: 62181 (optimize conditional branches:
                introduce POP_JUMP_IF_FALSE and POP_JUMP_IF_TRUE)
    Python 2.7a0  62191 (introduce SETUP_WITH)
    Python 2.7a0  62201 (introduce BUILD_SET)
    Python 2.7a0  62211 (introduce MAP_ADD and SET_ADD)
.
*/
/* MAGIC packs the version number from the list above into the low 16
   bits, '\r' into bits 16-23 and '\n' into bits 24-31.  Written out
   least significant byte first (see w_long), 62211 == 0xF303 therefore
   yields the byte sequence 03 F3 0D 0A. */
#define MAGIC (62211 | ((long)'\r'<<16) | ((long)'\n'<<24))

/* Magic word as global; note that _PyImport_Init() can change the
   value of this global to accommodate for alterations of how the
   compiler works which are enabled by command line switches. */
static long pyc_magic = MAGIC;

PyMarshal_WriteLongToFile 和 PyMarshal_WriteObjectToFile 定义于 marshal.c 中。

marshal.c

PyMarshal_WriteLongToFile

我们先来看 PyMarshal_WriteLongToFile https://github.com/python/cpython/blob/2.7/Python/marshal.c#L462

/* Write the long x to the open file fp using w_long().

   x       - value to write.
   fp      - destination stream.
   version - marshal format version; stored in the writer state.

   version currently has no effect for writing longs. */
void
PyMarshal_WriteLongToFile(long x, FILE *fp, int version)
{
    /* Set up a file-backed writer: fp is set, the in-memory buffer
       fields (str/ptr/end) stay NULL. */
    WFILE wf;
    wf.fp = fp;
    wf.str = NULL;
    wf.ptr = NULL;
    wf.end = NULL;
    wf.error = WFERR_OK;
    wf.depth = 0;
    wf.strings = NULL;
    wf.version = version;
    w_long(x, &wf);
}

PyMarshal_WriteLongToFile 创建了 WFILE,将已打开的文件指针(FILE *)赋值给 WFILE 的 fp 成员,并调用 w_long。

用于表示写入的 pyc 文件的 WFILE 结构如下。

/* Writer state shared by the w_* marshal routines.  Output goes to fp
   when it is non-NULL; otherwise bytes are stored through ptr until it
   reaches end (see the w_byte macro). */
typedef struct {
    FILE *fp;   /* destination stream, or NULL when writing to memory */
    int error;  /* see WFERR_* values */
    int depth;  /* current w_object() nesting level */
    /* If fp == NULL, the following are valid: */
    PyObject *str;
    char *ptr;  /* next byte to write */
    char *end;  /* ptr == end means the buffer is full */
    PyObject *strings; /* dict on marshal, list on unmarshal */
    int version;  /* marshal format version requested by the caller */
} WFILE;

跟入 w_long

/* Write the low 32 bits of x to p as four bytes, least significant byte
   first.  Any higher bits of x are not written. */
static void
w_long(long x, WFILE *p)
{
    w_byte((char)( x      & 0xff), p);
    w_byte((char)((x>> 8) & 0xff), p);
    w_byte((char)((x>>16) & 0xff), p);
    w_byte((char)((x>>24) & 0xff), p);
}

可以看到 w_long 只是调用了四次 w_byte,将 long 类型数值的低 4 个字节按小端序(低位字节在前)依次写入文件。

/* Write one byte c to the writer p: to p->fp with putc() when a file is
   attached, otherwise into the memory buffer at p->ptr; when the buffer
   is full (ptr == end) the byte is handed to w_more().
   NOTE(review): the macro expands to a bare if/else chain and evaluates
   p more than once, so use it only as a complete statement and with a
   side-effect-free p. */
#define w_byte(c, p) if (((p)->fp)) putc((c), (p)->fp); \
                      else if ((p)->ptr != (p)->end) *(p)->ptr++ = (c); \
                           else w_more(c, p)

w_byte 宏简单的将传入的一字节内容写入到 WFILE->fp ,即对应的 pyc 文件中。marshal 中的序列化写入操作都是基于 w_byte 封装的。

PyMarshal_WriteObjectToFile

PyMarshal_WriteObjectToFile 相对之前的 PyMarshal_WriteLongToFile 更加的复杂了,用于将 Python 对象序列化到文件中。

/* Marshal the Python object x to the open file fp using w_object().

   x       - object to serialize.
   fp      - destination stream.
   version - marshal format version; for version > 0 a dict is created
             for wf.strings and released again after writing.

   The function returns void, so wf.error is not passed back to the
   caller. */
void
PyMarshal_WriteObjectToFile(PyObject *x, FILE *fp, int version)
{
    /* File-backed writer: the in-memory buffer fields stay NULL. */
    WFILE wf;
    wf.fp = fp;
    wf.str = NULL;
    wf.ptr = NULL;
    wf.end = NULL;
    wf.error = WFERR_OK;
    wf.depth = 0;
    wf.strings = (version > 0) ? PyDict_New() : NULL;
    wf.version = version;
    w_object(x, &wf);
    /* XDECREF because strings is NULL when version <= 0. */
    Py_XDECREF(wf.strings);
}

可以看到 PyMarshal_WriteObjectToFile 调用的是 w_object ,是 marshal 最复杂的一个函数。

https://github.com/python/cpython/blob/2.7/Python/marshal.c#L212

static void
w_object(PyObject *v, WFILE *p)
{
    Py_ssize_t i, n;

    p->depth++;

    if (p->depth > MAX_MARSHAL_STACK_DEPTH) {
        p->error = WFERR_NESTEDTOODEEP;
    }
    else if (v == NULL) {
        w_byte(TYPE_NULL, p);
    }
    else if (v == Py_None) {
        w_byte(TYPE_NONE, p);
    }
    else if (v == PyExc_StopIteration) {
        w_byte(TYPE_STOPITER, p);
    }
    else if (v == Py_Ellipsis) {
        w_byte(TYPE_ELLIPSIS, p);
    }
    else if (v == Py_False) {
        w_byte(TYPE_FALSE, p);
    }
    else if (v == Py_True) {
        w_byte(TYPE_TRUE, p);
    }
    else if (PyInt_CheckExact(v)) {
        long x = PyInt_AS_LONG((PyIntObject *)v);
        w_byte(TYPE_INT, p);
        w_long(x, p);
    }
    else if (PyLong_CheckExact(v)) {
        PyLongObject *ob = (PyLongObject *)v;
        w_PyLong(ob, p);
    }
    ....
    else if (PyCode_Check(v)) {
        PyCodeObject *co = (PyCodeObject *)v;
        w_byte(TYPE_CODE, p);
        w_long(co->co_argcount, p);
        w_long(co->co_nlocals, p);
        w_long(co->co_stacksize, p);
        w_long(co->co_flags, p);
        w_object(co->co_code, p);
        w_object(co->co_consts, p);
        w_object(co->co_names, p);
        w_object(co->co_varnames, p);
        w_object(co->co_freevars, p);
        w_object(co->co_cellvars, p);
        w_object(co->co_filename, p);
        w_object(co->co_name, p);
        w_long(co->co_firstlineno, p);
        w_object(co->co_lnotab, p);
    }
    else {
        w_byte(TYPE_UNKNOWN, p);
        p->error = WFERR_UNMARSHALLABLE;
    }
   exit:
    p->depth--;
}

w_object 的主要逻辑为读取传入的 PyObject *v 的具体类型,调用 w_byte 写入一个字节的类型数据,然后调用不同的 w_ 系列函数序列化对应类型的数据。

这里我们省略其他类型的代码,重点看下 PyCodeObject 类型的处理。可以看到,w_object 只是简单地将 PyCodeObject 中的每个成员变量依次序列化到文件中,我们只需要按照同样的顺序(先读取类型标记字节,再读取对应的数据)去反序列化即可得到对应的内容。

TYPE 相关的宏定义于marshal.c#L27

/* One-character type tags.  w_object() writes the tag with w_byte()
   before the data of the value, e.g. TYPE_INT followed by w_long(), or
   TYPE_CODE followed by the fields of the code object. */
#define TYPE_NULL               '0'
#define TYPE_NONE               'N'
#define TYPE_FALSE              'F'
#define TYPE_TRUE               'T'
#define TYPE_STOPITER           'S'
#define TYPE_ELLIPSIS           '.'
#define TYPE_INT                'i'
#define TYPE_INT64              'I'
#define TYPE_FLOAT              'f'
#define TYPE_BINARY_FLOAT       'g'
#define TYPE_COMPLEX            'x'
#define TYPE_BINARY_COMPLEX     'y'
#define TYPE_LONG               'l'
#define TYPE_STRING             's'
#define TYPE_INTERNED           't'
#define TYPE_STRINGREF          'R'
#define TYPE_TUPLE              '('
#define TYPE_LIST               '['
#define TYPE_DICT               '{'
#define TYPE_CODE               'c'
#define TYPE_UNICODE            'u'
#define TYPE_UNKNOWN            '?'
#define TYPE_SET                '<'
#define TYPE_FROZENSET          '>'

使用 JAVA 反序列化 pyc 文件参考 PycFile.java

pyc 文件结构(Struct of pyc)

根据上面的分析,我们可以得出 pyc 文件的格式如下,其中 PyCodeObject 部分为变长,需要参考 w_object 进行反序列化。

----------
magic number 4 bytes
----------
timestamp 4 bytes
----------
PyCodeObject

PyCodeObject

根据上面的分析,我们知道了 pyc 文件中最主要的内容为序列化的 PyCodeObject,接下来我们就分析一下 PyCodeObject 的结构,以及如何生成及如何被解释执行。

PyCodeObject 定义于 Include/code.h#L10

/* Bytecode object */
/* The fields co_argcount through co_lnotab are exactly the ones that
   w_object() serializes for TYPE_CODE, in this declaration order;
   co_zombieframe and co_weakreflist are not written. */
typedef struct {
    PyObject_HEAD
    int co_argcount;		/* #arguments, except *args */
    int co_nlocals;		/* #local variables */
    int co_stacksize;		/* #entries needed for evaluation stack */
    int co_flags;		/* CO_..., see below */
    PyObject *co_code;		/* instruction opcodes */
    PyObject *co_consts;	/* list (constants used) */
    PyObject *co_names;		/* list of strings (names used) */
    PyObject *co_varnames;	/* tuple of strings (local variable names) */
    PyObject *co_freevars;	/* tuple of strings (free variable names) */
    PyObject *co_cellvars;      /* tuple of strings (cell variable names) */
    /* The rest doesn't count for hash/cmp */
    PyObject *co_filename;	/* string (where it was loaded from) */
    PyObject *co_name;		/* string (name, for reference) */
    int co_firstlineno;		/* first source line number */
    PyObject *co_lnotab;	/* string (encoding addr<->lineno mapping) See
				   Objects/lnotab_notes.txt for details. */
    void *co_zombieframe;     /* for optimization only (see frameobject.c) */
    PyObject *co_weakreflist;   /* to support weakrefs to code objects */
} PyCodeObject;

上面 load_source_module 中可以看到 pyc 文件的 PyCodeObject 是调用 parse_source_module 生成的。

parse_source_module

/* Compile the source file fp into a code object.

   pathname - file name handed to the parser and to the compiler.
   fp       - open source file.

   Returns the code object, or NULL when the arena cannot be created or
   when parsing or compiling fails. */
static PyCodeObject *
parse_source_module(const char *pathname, FILE *fp)
{
    PyCodeObject *co = NULL;
    mod_ty mod;
    PyCompilerFlags flags;
    /* The arena is passed to both the parser and the compiler and is
       freed as a whole before returning. */
    PyArena *arena = PyArena_New();
    if (arena == NULL)
        return NULL;

    flags.cf_flags = 0;

    /* Step 1: source text -> AST. */
    mod = PyParser_ASTFromFile(fp, pathname, Py_file_input, 0, 0, &flags,
                               NULL, arena);
    /* Step 2: AST -> code object; skipped when parsing failed, so co
       stays NULL. */
    if (mod) {
        co = PyAST_Compile(mod, pathname, NULL, arena);
    }
    PyArena_Free(arena);
    return co;
}

我们在第一篇CPython源码阅读笔记(1) 中曾经分析从 PyParser_ASTFromString 开始的代码生成流程,这里的逻辑和之前一致。

即在 compiler_mod 阶段划分好了 CFG ,然后按照 CFG 遍历生成 PyCodeObject。其中最外层为一个入口的 Block,嵌套的生成多个 code object。

代码生成测试

创建 test.py 如下

def test1(a, b):
    return a+b

c = test1(1,2)

在同级目录启动一个 Python 终端。

>>> f = open('test.py').read()
>>> co = compile(f, 'test.py', 'exec')
>>> import dis
>>> dis.dis(co)
 1           0 LOAD_CONST               0 (<code object test1 at 0x1028d15b0, file "test.py", line 1>)
              3 MAKE_FUNCTION            0
              6 STORE_NAME               0 (test1)

  4           9 LOAD_NAME                0 (test1)
             12 LOAD_CONST               1 (1)
             15 LOAD_CONST               2 (2)
             18 CALL_FUNCTION            2
             21 STORE_NAME               1 (c)
             24 LOAD_CONST               3 (None)
             27 RETURN_VALUE

可以看到 test1 函数生成了单独的一个 code object。

查看最外层 code object 的 co_consts 后找到了对应的 test1 函数的 code object 。

>>> co.co_consts
(<code object test1 at 0x1028d15b0, file "test.py", line 1>, 1, 2, None)

接着我们可以根据 PyCodeObject 的各个属性的名字猜测并查看其内容。

>>> co_test1 = co.co_consts[0]

>>> co_test1.co_argcount
2

>>> co_test1.co_varnames
('a', 'b')

>>> co_test1.co_code
'|\x00\x00|\x01\x00\x17S'

>>> dis.dis(co_test1)
  2           0 LOAD_FAST                0 (a)
              3 LOAD_FAST                1 (b)
              6 BINARY_ADD
              7 RETURN_VALUE

调试

按照第一篇文章中的方法,我们可以试着调试一下 test.py 的编译过程。

compiler_mod

在编译的入口函数 compiler_mod 处下断点,运行 test.py

> gdb python
(gdb) b compiler_mod
Breakpoint 1 at 0xe4602: file Python/compile.c, line 1219.
(gdb) r test.py
Starting program: /mnt/e/codes/Python-2.7.10/python.exe test.py
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".

compiler_body

单步跟入 compiler_mod 函数,可以看到传入的 mod 为 Module_kind,所以接下来跟入 compiler_body

(gdb) s
compiler_mod (mod=0x8695e08, c=0x7ffffffee620) at Python/compile.c:1219
1219        if (!module) {
(gdb) l
1214    compiler_mod(struct compiler *c, mod_ty mod)
1215    {
1216        PyCodeObject *co;
1217        int addNone = 1;
1218        static PyObject *module;
1219        if (!module) {
1220            module = PyString_InternFromString("<module>");
1221            if (!module)
1222                return NULL;
1223        }
(gdb) p c
$1 = (struct compiler *) 0x7ffffffee620
(gdb) p *c
$2 = {c_filename = 0x7ffffffeebbd "test.py", c_st = 0x863b190, c_future = 0x7fffff7c94a0, c_flags = 0x7ffffffee7dc, c_interactive = 0,
  c_nestlevel = 0, u = 0x0, c_stack = 0x7fffff6a6f80, c_arena = 0x864e520}
(gdb) p mod
$3 = (mod_ty) 0x8695e08
(gdb) p *mod
$4 = {kind = Module_kind, v = {Module = {body = 0x8695ab0}, Interactive = {body = 0x8695ab0}, Expression = {body = 0x8695ab0}, Suite = {
      body = 0x8695ab0}}}

compiler_mod

/* Compile the top-level AST node mod into a code object.

   c   - compiler state.
   mod - root AST node; mod->kind selects how its body is visited.

   Returns the assembled code object, or NULL/0 on failure. */
static PyCodeObject *
compiler_mod(struct compiler *c, mod_ty mod)
{
    PyCodeObject *co;
    /* Handed to assemble(); cleared only for Expression_kind. */
    int addNone = 1;
    /* Interned "<module>" string, created once and reused on later calls. */
    static PyObject *module;
    if (!module) {
        module = PyString_InternFromString("<module>");
        if (!module)
            return NULL;
    }
    /* Use 0 for firstlineno initially, will fixup in assemble(). */
    if (!compiler_enter_scope(c, module, mod, 0))
        return NULL;
    switch (mod->kind) {
    case Module_kind:
        if (!compiler_body(c, mod->v.Module.body)) {
            compiler_exit_scope(c);
            return 0;
        }
        break;
    case Interactive_kind:
        c->c_interactive = 1;
        VISIT_SEQ_IN_SCOPE(c, stmt,
                                mod->v.Interactive.body);
        break;
    case Expression_kind:
        VISIT_IN_SCOPE(c, expr, mod->v.Expression.body);
        addNone = 0;
        break;
    /* NOTE(review): the two error paths below return without calling
       compiler_exit_scope(), unlike the Module_kind failure path. */
    case Suite_kind:
        PyErr_SetString(PyExc_SystemError,
                        "suite should not be possible");
        return 0;
    default:
        PyErr_Format(PyExc_SystemError,
                     "module kind %d should not be possible",
                     mod->kind);
        return 0;
    }
    co = assemble(c, addNone);
    compiler_exit_scope(c);
    return co;
}

在 compiler_body 处下断点,然后跟入该函数。

(gdb) b compiler_body
Breakpoint 2 at 0x80e392a: compiler_body. (2 locations)
(gdb) c
Continuing.

Breakpoint 2, compiler_mod (mod=0x8695e08, c=0x7ffffffee620) at Python/compile.c:1229
1229            if (!compiler_body(c, mod->v.Module.body)) {
(gdb) s
compiler_body (stmts=0x8695ab0, c=0x7ffffffee620) at Python/compile.c:1198
1198        if (!asdl_seq_LEN(stmts))

可以看到,compiler_body 只是简单地将 stmts 中的元素依次取出,通过 VISIT 宏进行代码生成。

/* Generate code for a sequence of statements.

   c     - compiler state.
   stmts - statement sequence to compile.

   Returns 1 on success and 0 on failure.  An empty sequence succeeds
   without emitting anything.  Note that VISIT returns 0 from this
   function as soon as a visit fails. */
static int
compiler_body(struct compiler *c, asdl_seq *stmts)
{
    /* Index of the first statement handled by the loop below. */
    int i = 0;
    stmt_ty st;

    if (!asdl_seq_LEN(stmts))
        return 1;
    st = (stmt_ty)asdl_seq_GET(stmts, 0);
    /* A leading docstring is compiled as an expression and stored under
       __doc__; the main loop then starts at the second statement. */
    if (compiler_isdocstring(st) && Py_OptimizeFlag < 2) {
        /* don't generate docstrings if -OO */
        i = 1;
        VISIT(c, expr, st->v.Expr.value);
        if (!compiler_nameop(c, __doc__, Store))
            return 0;
    }
    for (; i < asdl_seq_LEN(stmts); i++)
        VISIT(c, stmt, (stmt_ty)asdl_seq_GET(stmts, i));
    return 1;
}
/* Call compiler_visit_<TYPE>(C, V) and make the enclosing function
   return 0 when that call fails.  Only usable inside functions that
   report failure by returning 0. */
#define VISIT(C, TYPE, V) {\
    if (!compiler_visit_ ## TYPE((C), (V))) \
        return 0; \
}

VISIT(c, stmt, (stmt_ty)asdl_seq_GET(stmts, i)); 展开其实就是compiler_visit_stmt(c, (stmt_ty)asdl_seq_GET(stmts, i))

单步跟入循环中的 VISIT 调用,查看传入的 stmt 参数,为 Include/Python-ast.h 中定义的 struct _stmt,即 stmt AST Node(stmt 的语法树节点)。

(gdb) s
compiler_visit_stmt (c=0x7ffffffee620, s=0x8695c58) at Python/compile.c:2117
2117        c->u->u_lineno = s->lineno;
(gdb) p *s
$19 = {kind = FunctionDef_kind, v = {FunctionDef = {name = 0x7fffff67a480, args = 0x8695b50, body = 0x8695b70, decorator_list = 0x0},
   ....

Python-ast.h 中定义了 Python 中 stmt 的类型。 https://github.com/python/cpython/blob/2.7/Include/Python-ast.h#L62

/* Discriminator for statement AST nodes: compiler_visit_stmt() switches
   on this value (s->kind) to pick the code generator, e.g.
   FunctionDef_kind -> compiler_function(). */
enum _stmt_kind {FunctionDef_kind=1, ClassDef_kind=2, Return_kind=3,
                  Delete_kind=4, Assign_kind=5, AugAssign_kind=6, Print_kind=7,
                  For_kind=8, While_kind=9, If_kind=10, With_kind=11,
                  Raise_kind=12, TryExcept_kind=13, TryFinally_kind=14,
                  Assert_kind=15, Import_kind=16, ImportFrom_kind=17,
                  Exec_kind=18, Global_kind=19, Expr_kind=20, Pass_kind=21,
                  Break_kind=22, Continue_kind=23};

compiler_visit_stmt

https://github.com/python/cpython/blob/2.7/Python/compile.c#L2074

static int
compiler_visit_stmt(struct compiler *c, stmt_ty s)
{
    int i, n;

    /* Always assign a lineno to the next instruction for a stmt. */
    c->u->u_lineno = s->lineno;
    c->u->u_lineno_set = false;

    switch (s->kind) {
    case FunctionDef_kind:
        return compiler_function(c, s);
    ...
    }
    return 1;
}

因为这里第一次传入的 stmt 的类型为 FunctionDef_kind,这里会调用 compiler_function

(gdb) n
2120        switch (s->kind) {
(gdb) n
2122            return compiler_function(c, s);

compiler_function

跟入 compiler_function, 这里即是真正的代码生成逻辑。

https://github.com/python/cpython/blob/2.7/Python/compile.c#L1351

static int
compiler_function(struct compiler *c, stmt_ty s)
{
    PyCodeObject *co;
    PyObject *first_const = Py_None;
    arguments_ty args = s->v.FunctionDef.args;
    asdl_seq* decos = s->v.FunctionDef.decorator_list;
    stmt_ty st;
    int i, n, docstring;

    assert(s->kind == FunctionDef_kind);

    if (!compiler_decorators(c, decos))
        return 0;
    if (args->defaults)
        VISIT_SEQ(c, expr, args->defaults);
    if (!compiler_enter_scope(c, s->v.FunctionDef.name, (void *)s,
                              s->lineno))
        return 0;

    st = (stmt_ty)asdl_seq_GET(s->v.FunctionDef.body, 0);

    /* unpack nested arguments */
    compiler_arguments(c, args);

    c->u->u_argcount = asdl_seq_LEN(args->args);
    n = asdl_seq_LEN(s->v.FunctionDef.body);
    ....
    co = assemble(c, 1);
    compiler_exit_scope(c);
    if (co == NULL)
        return 0;

    compiler_make_closure(c, co, asdl_seq_LEN(args->defaults));
    Py_DECREF(co);

    for (i = 0; i < asdl_seq_LEN(decos); i++) {
        ADDOP_I(c, CALL_FUNCTION, 1);
    }

    return compiler_nameop(c, s->v.FunctionDef.name, Store);
}

这里逻辑比较复杂,就不贴调试的过程了。大致的流程为

  • 将 FuncDef AST Node 中的一些 metadata 存储到 compiler 对象中。
  • 调用 assemble 将函数体生成单独的 code object。
  • 调用 compiler_make_closure 生成 LOAD_CONST 和 MAKE_FUNCTION 两个 opcode。
static int
compiler_make_closure(struct compiler *c, PyCodeObject *co, int args)
{
    int i, free = PyCode_GetNumFree(co);
    if (free == 0) {
        ADDOP_O(c, LOAD_CONST, (PyObject*)co, consts);
        ADDOP_I(c, MAKE_FUNCTION, args);
        return 1;
    }
    ...
}
  • 调用 compiler_nameop 生成 STORE_NAME opcode。
 static int
compiler_nameop(struct compiler *c, identifier name, expr_context_ty ctx)
{
    int op, scope, arg;
    enum { OP_FAST, OP_GLOBAL, OP_DEREF, OP_NAME } optype;
    ....
    op = 0;
    optype = OP_NAME;
    ....
    switch (optype) {
     case OP_NAME:
        switch (ctx) {
        case Load: op = LOAD_NAME; break;
        case Store: op = STORE_NAME; break;
        case Del: op = DELETE_NAME; break;
        case AugLoad:
        case AugStore:
            break;
        case Param:
        default:
            PyErr_SetString(PyExc_SystemError,
                            "param invalid for name variable");
            return 0;
        }
        break;
    }

    assert(op);
    arg = compiler_add_o(c, dict, mangled);
    Py_DECREF(mangled);
    if (arg < 0)
        return 0;
    return compiler_addop_i(c, op, arg);
}

至此生成了下面的字节码

LOAD_CONST               0 
MAKE_FUNCTION            0
STORE_NAME               0

对应源码中的

def test1(a, b):
    return a+b

Py_OPCODE

字节码对应的数字定义于 opcode.h

其中 HAS_ARG 宏定义了字节码是否带有参数(通过判断字节码对应的数字是否大于等于 HAVE_ARGUMENT)。

/* True when opcode op carries an argument, i.e. op >= HAVE_ARGUMENT. */
#define HAS_ARG(op) ((op) >= HAVE_ARGUMENT)

在 Python2.7 中这个值为 90

#define HAVE_ARGUMENT	90	/* Opcodes from here have an argument: */