[ovs-dev,2/3] Add optional C extension wrapper for Python JSON parsing
diff mbox

Message ID 1465394115-67693-3-git-send-email-twilson@redhat.com
State Accepted
Headers show

Commit Message

Terry Wilson June 8, 2016, 1:55 p.m. UTC
The pure Python in-tree JSON parser is *much* slower than the
in-tree C JSON parser. A local test parsing a 100Mb JSON file
showed the Python version taking 270 seconds. With the C wrapper,
it took under 4 seconds.

The C extension will be used automatically if it can be built. If
the extension fails to build, a warning is displayed and the build
is restarted without the extension.

The Serializer class is replaced with Python's built-in
JSON library since the ability to process chunked data is not
needed in that case.

The extension should work with both Python 2.7 and Python 3.3+.

Signed-off-by: Terry Wilson <twilson@redhat.com>
---
 Makefile.am        |   2 +-
 python/automake.mk |   3 +
 python/ovs/_json.c | 268 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 python/ovs/json.py |  11 +++
 python/setup.py    |  51 +++++++++-
 5 files changed, 332 insertions(+), 3 deletions(-)
 create mode 100644 python/ovs/_json.c

Comments

Ben Pfaff June 8, 2016, 6:44 p.m. UTC | #1
On Wed, Jun 08, 2016 at 08:55:14AM -0500, Terry Wilson wrote:
> The pure Python in-tree JSON parser is *much* slower than the
> in-tree C JSON parser. A local test parsing a 100Mb JSON file
> showed the Python version taking 270 seconds. With the C wrapper,
> it took under 4 seconds.
> 
> The C extension will be used automatically if it can be built. If
> the extension fails to build, a warning is displayed and the build
> is restarted without the extension.
> 
> The Serializer class is replaced with Python's built-in
> JSON library since the ability to process chunked data is not
> needed in that case.
> 
> The extension should work with both Python 2.7 and Python 3.3+.
> 
> Signed-off-by: Terry Wilson <twilson@redhat.com>

Applied, thanks!

The automatic behavior here is an asset for a developer, but it is
probably undesirable for use in packaging systems, because one wants to
make sure that the packaging always has or does not have the extension,
without depending on what the system has on it.  So it might be good to
support something like --enable-openssl=[yes|no|check], where "yes"
means that OpenSSL must be available, "no" means that OpenSSL will not
be used, and "check" (the default) means to use OpenSSL if it's
available.

Patch
diff mbox

diff --git a/Makefile.am b/Makefile.am
index 69dbe3d..8cb8523 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -251,7 +251,7 @@  config-h-check:
 	@cd $(srcdir); \
 	if test -e .git && (git --version) >/dev/null 2>&1 && \
 	   git --no-pager grep -L '#include <config\.h>' `git ls-files | grep '\.c$$' | \
-               grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows'`; \
+               grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python'`; \
 	then \
 	    echo "See above for list of violations of the rule that"; \
 	    echo "every C source file must #include <config.h>."; \
diff --git a/python/automake.mk b/python/automake.mk
index dc62422..ecad39d 100644
--- a/python/automake.mk
+++ b/python/automake.mk
@@ -46,6 +46,9 @@  EXTRA_DIST += \
 	python/README.rst \
 	python/setup.py
 
+# C extension support.
+EXTRA_DIST += python/ovs/_json.c
+
 PYFILES = $(ovs_pyfiles) python/ovs/dirs.py $(ovstest_pyfiles)
 EXTRA_DIST += $(PYFILES)
 PYCOV_CLEAN_FILES += $(PYFILES:.py=.py,cover)
diff --git a/python/ovs/_json.c b/python/ovs/_json.c
new file mode 100644
index 0000000..c4e2af3
--- /dev/null
+++ b/python/ovs/_json.c
@@ -0,0 +1,268 @@ 
+#include "Python.h"
+#include <openvswitch/lib/json.h>
+#include "structmember.h"
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    struct json_parser *_parser;
+} json_ParserObject;
+
+static void
+Parser_dealloc(json_ParserObject * p)
+{
+    json_parser_abort(p->_parser);
+    Py_TYPE(p)->tp_free(p);
+}
+
+static PyObject *
+Parser_new(PyTypeObject * type, PyObject * args, PyObject * kwargs)
+{
+    json_ParserObject *self;
+    static char *kwlist[] = { "check_trailer", NULL };
+    PyObject *check_trailer = NULL;
+    int ct_int = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O", kwlist,
+                                     &check_trailer)) {
+        return NULL;
+    }
+
+    if (check_trailer != NULL) {
+        ct_int = PyObject_IsTrue(check_trailer);
+        if (ct_int < 0) {
+            return NULL;
+        } else if (ct_int) {
+            ct_int = JSPF_TRAILER;
+        }
+    }
+
+    self = (json_ParserObject *) type->tp_alloc(type, 0);
+    if (self != NULL) {
+        self->_parser = json_parser_create(ct_int);
+    }
+
+    return (PyObject *) self;
+}
+
+static PyObject *
+Parser_feed(json_ParserObject * self, PyObject * args)
+{
+    Py_ssize_t input_sz;
+    PyObject *input;
+    size_t rd;
+    char *input_str;
+
+    if (self->_parser == NULL) {
+        return NULL;
+    }
+
+    if (!PyArg_UnpackTuple(args, "input", 1, 1, &input)) {
+        return NULL;
+    }
+#ifdef IS_PY3K
+    if ((input_str = PyUnicode_AsUTF8AndSize(input, &input_sz)) == NULL) {
+#else
+    if (PyString_AsStringAndSize(input, &input_str, &input_sz) < 0) {
+#endif
+        return NULL;
+    }
+
+    rd = json_parser_feed(self->_parser, input_str, (size_t) input_sz);
+
+#ifdef IS_PY3K
+    return PyLong_FromSize_t(rd);
+#else
+    return PyInt_FromSize_t(rd);
+#endif
+}
+
+static PyObject *
+Parser_is_done(json_ParserObject * self)
+{
+    if (self->_parser == NULL) {
+        return NULL;
+    }
+    return PyBool_FromLong(json_parser_is_done(self->_parser));
+}
+
+static PyObject *
+json_to_python(struct json *json)
+{
+    switch (json->type) {
+    case JSON_NULL:
+        Py_RETURN_NONE;
+    case JSON_FALSE:
+        Py_RETURN_FALSE;
+    case JSON_TRUE:
+        Py_RETURN_TRUE;
+    case JSON_OBJECT:{
+            struct shash_node *node;
+            PyObject *dict = PyDict_New();
+
+            if (dict == NULL) {
+                return PyErr_NoMemory();
+            }
+            SHASH_FOR_EACH(node, json->u.object) {
+                PyObject *key = PyUnicode_FromString(node->name);
+                PyObject *val = json_to_python(node->data);
+
+                if (!(key && val) || PyDict_SetItem(dict, key, val)) {
+                    Py_XDECREF(key);
+                    Py_XDECREF(val);
+                    Py_XDECREF(dict);
+                    return NULL;
+                }
+
+                Py_XDECREF(key);
+                Py_XDECREF(val);
+            }
+            return dict;
+        }
+    case JSON_ARRAY:{
+            int i;
+            PyObject *arr = PyList_New(json->u.array.n);
+
+            if (arr == NULL) {
+                return PyErr_NoMemory();
+            }
+            for (i = 0; i < json->u.array.n; i++) {
+                PyObject *item = json_to_python(json->u.array.elems[i]);
+
+                if (!item || PyList_SetItem(arr, i, item)) {
+                    Py_XDECREF(arr);
+                    return NULL;
+                }
+            }
+            return arr;
+        }
+    case JSON_REAL:
+        if (json->u.real != 0) {
+            return PyFloat_FromDouble(json->u.real);
+        } /* fall through to treat 0 as int */
+    case JSON_INTEGER:
+#ifdef IS_PY3K
+        return PyLong_FromLong((long) json->u.integer);
+#else
+        return PyInt_FromLong((long) json->u.integer);
+#endif
+
+    case JSON_STRING:
+        return PyUnicode_FromString(json->u.string);
+    default:
+        return NULL;
+    }
+}
+
+static PyObject *
+Parser_finish(json_ParserObject * self)
+{
+    struct json *json;
+    PyObject *obj;
+
+    if (self->_parser == NULL) {
+        return NULL;
+    }
+
+    json = json_parser_finish(self->_parser);
+    self->_parser = NULL;
+    obj = json_to_python(json);
+    return obj;
+}
+
+static PyMethodDef Parser_methods[] = {
+    {"feed", (PyCFunction) Parser_feed, METH_VARARGS,
+     "Feed data to the parser and return the index of the last object."},
+    {"is_done", (PyCFunction) Parser_is_done, METH_NOARGS,
+     "Whether the parser has finished decoding an object."},
+    {"finish", (PyCFunction) Parser_finish, METH_NOARGS,
+     "Finish parsing and return Python object parsed."},
+    {NULL},
+};
+
+static PyTypeObject json_ParserType = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+        "ovs._json.Parser",     /* tp_name */
+    sizeof (json_ParserObject), /* tp_basicsize */
+    0,                          /* tp_itemsize */
+    (destructor) Parser_dealloc,        /* tp_dealloc */
+    0,                          /* tp_print */
+    0,                          /* tp_getattr */
+    0,                          /* tp_setattr */
+    0,                          /* tp_compare */
+    0,                          /* tp_repr */
+    0,                          /* tp_as_number */
+    0,                          /* tp_as_sequence */
+    0,                          /* tp_as_mapping */
+    0,                          /* tp_hash */
+    0,                          /* tp_call */
+    0,                          /* tp_str */
+    0,                          /* tp_getattro */
+    0,                          /* tp_setattro */
+    0,                          /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,   /* tp_flags */
+    "Parser objects",           /* tp_doc */
+    0,                          /* tp_traverse */
+    0,                          /* tp_clear */
+    0,                          /* tp_richcompare */
+    0,                          /* tp_weaklistoffset */
+    0,                          /* tp_iter */
+    0,                          /* tp_iternext */
+    Parser_methods,             /* tp_methods */
+    0,                          /* tp_members */
+    0,                          /* tp_getset */
+    0,                          /* tp_base */
+    0,                          /* tp_dict */
+    0,                          /* tp_descr_get */
+    0,                          /* tp_descr_set */
+    0,                          /* tp_dictoffset */
+    0,                          /* tp_init */
+    0,                          /* tp_alloc */
+    Parser_new,                 /* tp_new */
+};
+
+#ifdef IS_PY3K
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "ovs._json",                /* m_name */
+    "OVS JSON Parser module",   /* m_doc */
+    0,                          /* m_size */
+    0,                          /* m_methods */
+    0,                          /* m_slots */
+    0,                          /* m_traverse */
+    0,                          /* m_clear */
+    0,                          /* m_free */
+};
+
+#define INITERROR return NULL
+#else /* !IS_PY3K */
+#define INITERROR return
+#endif
+
+PyMODINIT_FUNC
+#ifdef IS_PY3K
+PyInit__json(void)
+#else
+init_json(void)
+#endif
+{
+    PyObject *m;
+
+    if (PyType_Ready(&json_ParserType) < 0) {
+        INITERROR;
+    }
+#ifdef IS_PY3K
+    m = PyModule_Create(&moduledef);
+#else
+    m = Py_InitModule3("ovs._json", NULL, "OVS JSON Parser module");
+#endif
+
+    Py_INCREF(&json_ParserType);
+    PyModule_AddObject(m, "Parser", (PyObject *) & json_ParserType);
+#ifdef IS_PY3K
+    return m;
+#endif
+}
diff --git a/python/ovs/json.py b/python/ovs/json.py
index ff986ea..ea0400a 100644
--- a/python/ovs/json.py
+++ b/python/ovs/json.py
@@ -18,6 +18,11 @@  import sys
 import six
 from six.moves import range
 
+try:
+    import ovs._json
+except ImportError:
+    pass
+
 __pychecker__ = 'no-stringiter'
 
 escapes = {ord('"'): u"\\\"",
@@ -165,6 +170,12 @@  class Parser(object):
     # Maximum height of parsing stack. #
     MAX_HEIGHT = 1000
 
+    def __new__(cls, *args, **kwargs):
+        try:
+            return ovs._json.Parser(*args, **kwargs)
+        except NameError:
+            return super(Parser, cls).__new__(cls)
+
     def __init__(self, check_trailer=False):
         self.check_trailer = check_trailer
 
diff --git a/python/setup.py b/python/setup.py
index 49c8c4e..19c1f18 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -13,6 +13,10 @@ 
 from __future__ import print_function
 import sys
 
+from distutils.command.build_ext import build_ext
+from distutils.errors import CCompilerError, DistutilsExecError, \
+    DistutilsPlatformError
+
 import setuptools
 
 VERSION = "unknown"
@@ -25,8 +29,33 @@  except IOError:
           file=sys.stderr)
     sys.exit(-1)
 
+ext_errors = (CCompilerError, DistutilsExecError, DistutilsPlatformError)
+if sys.platform == 'win32':
+    ext_errors += (IOError, ValueError)
+
+
+class BuildFailed(Exception):
+    pass
+
+
+class try_build_ext(build_ext):
+    # This class allows C extension building to fail
+    # NOTE: build_ext is not a new-style class
+
+    def run(self):
+        try:
+            build_ext.run(self)
+        except DistutilsPlatformError:
+            raise BuildFailed()
 
-setuptools.setup(
+    def build_extension(self, ext):
+        try:
+            build_ext.build_extension(self, ext)
+        except ext_errors:
+            raise BuildFailed()
+
+
+setup_args = dict(
     name='ovs',
     description='Open vSwitch library',
     version=VERSION,
@@ -46,5 +75,23 @@  setuptools.setup(
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.4',
-    ]
+    ],
+    ext_modules=[setuptools.Extension("ovs._json", sources=["ovs/_json.c"],
+                                      libraries=['openvswitch'])],
+    cmdclass={'build_ext': try_build_ext},
 )
+
+try:
+    setuptools.setup(**setup_args)
+except BuildFailed:
+    BUILD_EXT_WARNING = ("WARNING: The C extension could not be compiled, "
+                         "speedups are not enabled.")
+    print("*" * 75)
+    print(BUILD_EXT_WARNING)
+    print("Failure information, if any, is above.")
+    print("Retrying the build without the C extension.")
+    print("*" * 75)
+
+    del(setup_args['cmdclass'])
+    del(setup_args['ext_modules'])
+    setuptools.setup(**setup_args)