[Pyrex] faster in / output from objects [long post + code!]

Martijn Meijers w3-pyrex at zw9.nl
Thu Jan 24 13:24:25 CET 2008


Dear list members,

Currently I'm working in the geo-informatics field and I'm doing 
research on storage of vector data in a DBMS. For this my programming 
language of choice is Python. . Although there are some vector libraries 
in C with Python bindings, I feel that those are  not really comfortable 
to work with (due to their API).Therefore, I decided to roll my own 
library for educational and research purposes and I'm using Cython for 
this purpose (as I'm not really proficient in C or C++, and I'm not 
really willing to go that route, as it involves quite a steep learning 
curve).

Below, you'll find my library that I created. Creation of objects is 
fairly fast, compared with the C-lib-with-python bindings that I have 
available for comparison (my approach is around 1.5 times faster with 
object creation). However, I'm stuck with in/output of my objects: Two 
formats I'd like to support: a text based format and a binary format. 
Here, I have the feeling I don't understand how I can use Cython to push 
the throughput to the limits. My approach (with Visitors) is fairly 
slow. As I understand it, Cython is more geared towards (mathematical) 
computations, then to text processing...

I'd like to know some things about my code:
(a) Did I do things the right way, or can the code be optimized more 
(while staying in Cython)?
(b) Is it possible to speed up the in- and output of text and binary 
formats (here a lot of python functions are still used, but I can't seem 
to find examples of how to do text/binary stream processing with Cython)...?

Thanks very much for your time and advice in advance!

Kind regards,

Martijn Meijers
Delft University of Technology, The Netherlands
OTB, Section GIS-technology


# ------------------------------------------------------
# lwgeom2d.pyx

#
# Light-Weight Geometry 2D for Python (with Cython)
# Martijn Meijers
# Init date: 18 January 2008
#
from cStringIO import StringIO
from struct import pack, unpack

#===============================================================================
# We need this visitor class here, so we can reference it in geometries
# TODO: split out in separate, logical, packages
#===============================================================================
cdef class Visitor:
    def __init__(self):
        if self.__class__ is Visitor:
            raise NotImplementedError
   
    def visit(self, item):
        raise NotImplementedError

#===============================================================================
# Geometry types defined in Simple Feature Specification
#===============================================================================
cdef enum:
    __GEOMETRY = 0
    __POINT = 1
    __CURVE = 2
    __LINESTRING = 3
    __SURFACE = 4
    __POLYGON = 5
    __COLLECTION = 6
    __MULTIPOINT = 7
    __MULTICURVE = 8
    __MULTILINESTRING = 9
    __MULTISURFACE = 10
    __MULTIPOLYGON = 11

cdef struct Coordinate:
    float x
    float y

#===============================================================================
# Par. 3.2.3 Geometry Types
#===============================================================================
# More or less conforming to "Type level 1", i.e. supported are:
# Point, LineString, Polygon,
#
# In the (near) future, we will _probably_ support:
# MultiPoint, MultiLineString, MultiPolygon
#===============================================================================
cdef class Geometry: 
    # Abstract class -> no instances
    cdef readonly unsigned int type
   
    def __init__(self): #, int srid):
        if self.__class__ is Geometry:
            raise NotImplementedError

    property geometry_type:
        def __get__(self):
            return geometry_type(self)

    property empty:
        def __get__(self):
            return is_empty(self)

    def __str__(self):
        return as_text(self)
   
    def __repr__(self):
        return as_text(self)

cdef class Point(Geometry):
    cdef Coordinate coordinate
      
    property x:
        def __get__(self):
            return self.coordinate.x
       
        def __set__(self, float value):
            self.coordinate.x = value
   
    property y:
        def __get__(self):
            return self.coordinate.y
       
        def __set__(self, float value):
            self.coordinate.y = value

    def __init__(self, float x, float y): #, int srid = -1):
        self.type = __POINT
        self.coordinate.x = x
        self.coordinate.y = y
        # self.srid = srid
   
    def __contains__(Point self, float item):
        return (item == self.coordinate.x or item == self.coordinate.y)
   
    def __richcmp__(Point self, Point other, unsigned int op):
        # Meaning of op < 0, == 2, > 4, <= 1, != 3, >= 5
        if op == 2: # ==
            if  type(other) == self.__class__  and self.x == other.x and 
self.y == other.y: # and self.srid == other.srid:
                return True
            else:
                return False
        if op == 3: # !=
            if  type(other) == self.__class__ and (self.x != other.x or 
self.y != other.y):# or self.srid != other.srid):
                return True
            else:
                return False
   
    def accept(Point self, Visitor visitor):
        return visitor.visitPoint(self)


cdef class Curve(Geometry):
    # Abstract class -> no instances
    def __init__(self): #, int srid):
        if self.__class__ is Curve:
            raise NotImplementedError

cdef class LineString(Curve):
    #cdef public int srid
    cdef object points
   
    def __init__(self): #, int srid = -1):
        self.type = __LINESTRING
        self.points = []
        # self.srid = srid

    def __richcmp__(LineString self, LineString other, unsigned int op):
        # Meaning of op < 0, == 2, > 4, <= 1, != 3, >= 5
        if op == 2: # ==
            if type(other) == self.__class__  and self.points == 
other.points:# and self.srid == other.srid:
                return True
            else:
                return False
        if op == 3: # !=
            if type(other) == self.__class__  and (self.points != 
other.points):# or self.srid != other.srid):
                return True
            else:
                return False
           
    def __setitem__(LineString self, unsigned int key, Point item):
        #assert item.__class__.__name__, "Point"
        self.points[key] = item

    def __getitem__(LineString self, unsigned int n):
        return self.points[n]
   
    def __delitem__(LineString self, unsigned int n):
        del self.points[n]
   
    def __len__(LineString self):
        return len(self.points)

    def __contains__(LineString self, Point item):
        return item in self.points

    def append(LineString self, Point item):
        #assert item.__class__.__name__, "Point"
        self.points.append(item)
   
    def accept(LineString self, Visitor visitor):
        return visitor.visitLineString(self)

cdef class Surface(Geometry): 
    # Abstract class -> no instances
    def __init__(self):
        if self.__class__ is Surface:
            raise NotImplementedError
   
cdef class Polygon(Surface):
    #cdef public int srid
    cdef object rings
   
    def __init__(Polygon self): #, int srid = -1):
        self.type = __POLYGON
        self.rings = []
#        self.srid = srid

    def __richcmp__(Polygon self, Polygon other, unsigned int op):
        # Meaning of op < 0, == 2, > 4, <= 1, != 3, >= 5
        if op == 2: # ==
            if type(other) == self.__class__ and self.rings == 
other.rings:# and self.srid == other.srid:
                return True
            else:
                return False
        if op == 3: # !=
            if type(other) == self.__class__ and (self.rings != 
other.rings):# or self.srid != other.srid):
                return True
            else:
                return False

    def __len__(self):
        return len(self.rings)

    def __setitem__(Polygon self, unsigned int n, LineString item):
        #assert item.__class__.__name__, "LineString"
        self.rings[n] = item

    def __getitem__(Polygon self, unsigned int n):
        return self.rings[n]
   
    def __contains__(Polygon self, LineString item):
        return item in self.rings
   
    def append(Polygon self, LineString item):
        #assert item.__class__.__name__, "LineString"
        self.rings.append(item)
   
    def accept(Polygon self, Visitor visitor):
        return visitor.visitPolygon(self)

##===============================================================================
## Par 3.2.6 Constructing a Geometry from Text (non-conforming)
##===============================================================================
#def geom_from_text(text, srid = -1):
#    pass
##===============================================================================
## Par 3.2.7 Constructing a Geometry from WKB (non-conforming)
##===============================================================================
#def geom_from_wkb(wkb, srid = -1):
#    pass

#===============================================================================
# Par 3.2.8 Obtaining WKT Representation
#===============================================================================
def as_text(Geometry geom):
    pass
#
##===============================================================================
## Par 3.2.9 Obtaining WKB representation
##===============================================================================
#def as_binary(Geometry geom):
#    pass
#
##===============================================================================
## Par 3.2.10 Functions on Geometry
##===============================================================================
#def dimension(Geometry geom):
#    pass

def geometry_type(Geometry geom):
    type_lookup= {0: "GEOMETRY",
                  1: "POINT",
                  2: "CURVE",
                  3: "LINESTRING",
                  4: "SURFACE",
                  5: "POLYGON",
                  6: "COLLECTION",
                  7: "MULTIPOINT",
                  8: "MULTICURVE",
                  9: "MULTILINESTRING",
                  10: "MULTISURFACE",
                  11: "MULTIPOLYGON",
                  }
    return type_lookup[geom.type]

# TODO: support spatial reference id, or not?
#def srid(Geometry geom): # implemented as property?
#    return geom.srid

def is_empty(Geometry geom):
    if geom.type == __POINT:
        return False # Point cannot be empty, at the moment
    elif geom.type == __LINESTRING:
        return num_points(geom) == 0
    elif geom.type == __POLYGON:
        return num_rings(geom) == 0

#===============================================================================
# Par 3.2.11 Functions on Type Point
#===============================================================================
def x(Point geom):
    return geom.coordinate.x

def y(Point geom):
    return geom.coordinate.y

##===============================================================================
## Par 3.2.13 LineString
##===============================================================================

def num_points(LineString geom):
    return len(geom.points)

##===============================================================================
## Non SFS spec operations
##===============================================================================
def num_rings(Polygon geom):
    return len(geom.rings)

def __rings(Polygon geom):
    return geom.rings

def __points(LineString geom):
    return geom.points

cdef class WKBReader:
    cdef readonly char *byte_mask
    cdef readonly char *uint32_mask
    cdef readonly char *double_mask
    cdef object stream
    cdef unsigned int start, to_read, wkb_byte_order

    def __init__(self):
        self.wkb_byte_order = 0
        # native byte order of platform
        self.byte_mask = '@B'

    def read_geometry(self, object wkb):
        self.set_wkb(wkb)
        return self._read_geom()

    cdef set_wkb(self, object wkb):
        self.stream = StringIO(wkb)
        self.start = 0
        self.to_read = 0
               
    cdef void inc_ct(self, unsigned int val):
        # absolute position to start
        self.stream.seek(self.start, 0)
        self.to_read = val
        # next absolute position to end
        self.start = self.to_read + self.start

    cdef Geometry _read_geom(self):
        cdef unsigned int geo_type
        self.inc_ct(1)
        self.wkb_byte_order = self.unpack_byte()
        if self.wkb_byte_order == 0:
            #xdr: big_endian
            self.byte_mask = '>B'
            self.uint32_mask = '>i'
            self.double_mask = '>d'
        elif self.wkb_byte_order == 1:
            #ndr: small_endian
            self.byte_mask = '<B'
            self.uint32_mask = '<i'
            self.double_mask = '<d'

        self.inc_ct(4)
        geo_type = self.unpack_uint32()
       
        if geo_type == 1: # Point
            return self._read_point()
           
        elif geo_type == 2: # LineString
            return self._read_linestring()
                           
        elif geo_type == 3: # Polygon
            return self._read_polygon()
           
        else:
            raise NotImplementedError, "Type encountered that is not 
implemented (yet)"

    cdef Point _read_point(self):
        cdef float x, y
        self.inc_ct(8)
        x = self.unpack_double()
        self.inc_ct(8)
        y = self.unpack_double()
        return Point(x, y)
   
    cdef LineString _read_linestring(self):
        cdef LineString line
        cdef unsigned int num_points, i
        cdef float x, y
       
        line = LineString()
        self.inc_ct(4)
        num_points = self.unpack_uint32()
        for i from 0 <= i < num_points:
            self.inc_ct(8)
            x = self.unpack_double()
            self.inc_ct(8)
            y = self.unpack_double()
            line.append(Point(x,y))
        return line
   
    cdef Polygon _read_polygon(self):
        cdef Polygon poly
        cdef LineString ring
        cdef unsigned int num_points, num_rings, n, i
        cdef float x, y
        poly = Polygon()
        self.inc_ct(4)
        num_rings = self.unpack_uint32()
        for n from 0 <= n < num_rings:
            ring = LineString()
            self.inc_ct(4)
            num_points = self.unpack_uint32()
            for i from 0 <= i < num_points:
                self.inc_ct(8)
                x = self.unpack_double()
                self.inc_ct(8)
                y = self.unpack_double()
                ring.append(Point(x,y))
            poly.append(ring)
        return poly
   
    cdef unsigned int unpack_byte(self):
        return unpack(self.byte_mask, self.stream.read(self.to_read))[0]
   
    cdef unsigned int unpack_uint32(self):
        return unpack(self.uint32_mask, self.stream.read(self.to_read))[0]
   
    cdef float unpack_double(self):
        return unpack(self.double_mask, self.stream.read(self.to_read))[0]

cdef class WKTVisitor(Visitor):
   
    def visit(WKTVisitor self, Geometry geom):
        return geom.accept(self)
   
    def visitPoint(WKTVisitor self, Point point):
        return "POINT(%s)" % self.astext_point(point)
   
    def visitLineString(WKTVisitor self, LineString linestring):
        return "LINESTRING(%s)" % self.astext_linestring(linestring)

    def visitPolygon(WKTVisitor self, Polygon polygon):
        return "POLYGON((%s))" % self.astext_polygon(polygon)

    cdef astext_point(WKTVisitor self, Point point):
        cdef float x, y
        x = point.x
        y = point.y
        return "%.3f %.3f" % (x,y)

    cdef astext_linestring(WKTVisitor self, LineString linestring):
        cdef object line_txt = []
        cdef Point point
        line_txt = [self.astext_point(point) for point in linestring.points]
        return ", ".join(line_txt)

    cdef astext_polygon(WKTVisitor self, Polygon polygon):
        cdef object ring_txt = []
        cdef LineString linearring
        ring_txt = [self.astext_linestring(linearring) for linearring in 
polygon.rings]
        return "), (".join(ring_txt)

cdef class WKBVisitor(Visitor):
    cdef readonly char *byte_mask
    cdef readonly char *uint32_mask
    cdef readonly char *double_mask
    cdef object stream
    cdef readonly unsigned int wkb_byte_order

    def __cinit__(self):
        self.stream = []
        cdef bint big_endian_env
        # find native byte order of platform
        if pack("h", 1) == "\000\001": #xdr
            big_endian_env = True
        else: #ndr
            big_endian_env = False
        # and use this to encode geom to binary
        # (wkb format is self-describing by first byte)
        if big_endian_env:
            self.wkb_byte_order = 0 # wkbXDR = 0
        else:
            self.wkb_byte_order = 1 # wkbNDR = 1
        self.byte_mask = '@B'
        self.uint32_mask = '@i'
        self.double_mask = '@d'

    def visit(WKBVisitor self, Geometry geom):
        return geom.accept(self)
   
    def visitPoint(WKBVisitor self, Point point):
        cdef object wkb
        self.pack_byte_order()
        self.pack_geom_type(1)
        self.pack_coordinate(point.x, point.y)
        wkb = self.stream[:]
        self.stream = []
        return ''.join(wkb)
   
    def visitLineString(WKBVisitor self, LineString linestring):
        cdef object wkb
        cdef unsigned int nr_points, n
        self.pack_byte_order()
        self.pack_geom_type(2)
        nr_points = len(linestring)
        self.pack_nr(nr_points)
        for n from 0 <= n < nr_points:
            self.pack_coordinate(linestring[n].x, linestring[n].y)
        wkb = self.stream[:]
        self.stream = []
        return ''.join(wkb)

    def visitPolygon(WKBVisitor self, Polygon polygon):
        cdef object wkb
        cdef unsigned int nr_rings, nr_points, n, r
        self.pack_byte_order()
        self.pack_geom_type(3)
        nr_rings = len(polygon)
        self.pack_nr(nr_rings)
        for r from 0 <= r < nr_rings:
            nr_points = len(polygon[r])
            self.pack_nr(nr_points)
            for n from 0 <= n < nr_points:
                self.pack_coordinate(polygon[r][n].x, polygon[r][n].y)
        wkb = self.stream[:]
        self.stream = []
        return ''.join(wkb)

    # higher level "packers"
    cdef pack_byte_order(self):
        self.stream.append( self._pack_byte(self.wkb_byte_order) )

    cdef pack_geom_type(self, unsigned int type):
        self.stream.append( self._pack_uint32(type) )

    cdef pack_coordinate(self, float x, float y):
        self.stream.append( self._pack_double(x) )
        self.stream.append( self._pack_double(y) )
   
    cdef pack_nr(self, unsigned int nr):
        self.stream.append( self._pack_uint32(nr) )
   
    # low level "struct packers"
    cdef _pack_double(self, float data):
        return pack(self.double_mask, data)
   
    cdef _pack_uint32(self, unsigned int data):
        return pack(self.uint32_mask, data)
   
    cdef _pack_byte(self, unsigned int data):
        return pack(self.byte_mask, data)


# ------------------------------------------------------
# api_test.py

def api():
    pt = Point(10, 1)
    visitWKT = WKTVisitor()
    visitWKB = WKBVisitor()
    unserialize = WKBReader()
    wkb = visitWKB.visit(pt)

    start_time = time()
    for i in xrange( int(1e4) ):
        wkt = visitWKT.visit(pt)
    print_timing_info(time() - start_time, 1e4, "WKT visitor, point")
   
    start_time = time()
    for i in xrange( int(1e4) ):
        wkb = visitWKB.visit(pt)
    print_timing_info(time() - start_time, 1e4, "WKB visitor, point")

    ln = LineString()
    for j in xrange( int(60) ):
        pt = Point(999.325,j)
        ln.append(pt)
    ln[0] = Point(25, 1)

    start_time = time()
    for i in xrange( int(1e4) ):
        wkt = visitWKT.visit(ln)
    print_timing_info(time() - start_time, 1e4, "WKT visitor, line")

    start_time = time()
    for i in xrange( int(1e4) ):
        wkb = visitWKB.visit(ln)
    print_timing_info(time() - start_time, 1e4, "WKB visitor, line")







More information about the Pyrex mailing list