[Pyrex] faster in / output from objects [long post + code!]
Martijn Meijers
w3-pyrex at zw9.nl
Thu Jan 24 13:24:25 CET 2008
Dear list members,
Currently I'm working in the geo-informatics field and I'm doing
research on storage of vector data in a DBMS. For this my programming
language of choice is Python. . Although there are some vector libraries
in C with Python bindings, I feel that those are not really comfortable
to work with (due to their API).Therefore, I decided to roll my own
library for educational and research purposes and I'm using Cython for
this purpose (as I'm not really proficient in C or C++, and I'm not
really willing to go that route, as it involves quite a steep learning
curve).
Below, you'll find my library that I created. Creation of objects is
fairly fast, compared with the C-lib-with-python bindings that I have
available for comparison (my approach is around 1.5 times faster with
object creation). However, I'm stuck with in/output of my objects: Two
formats I'd like to support: a text based format and a binary format.
Here, I have the feeling I don't understand how I can use Cython to push
the throughput to the limits. My approach (with Visitors) is fairly
slow. As I understand it, Cython is more geared towards (mathematical)
computations, then to text processing...
I'd like to know some things about my code:
(a) Did I do things the right way, or can the code be optimized more
(while staying in Cython)?
(b) Is it possible to speed up the in- and output of text and binary
formats (here a lot of python functions are still used, but I can't seem
to find examples of how to do text/binary stream processing with Cython)...?
Thanks very much for your time and advice in advance!
Kind regards,
Martijn Meijers
Delft University of Technology, The Netherlands
OTB, Section GIS-technology
# ------------------------------------------------------
# lwgeom2d.pyx
#
# Light-Weight Geometry 2D for Python (with Cython)
# Martijn Meijers
# Init date: 18 January 2008
#
from cStringIO import StringIO
from struct import pack, unpack
#===============================================================================
# We need this visitor class here, so we can reference it in geometries
# TODO: split out in separate, logical, packages
#===============================================================================
cdef class Visitor:
def __init__(self):
if self.__class__ is Visitor:
raise NotImplementedError
def visit(self, item):
raise NotImplementedError
#===============================================================================
# Geometry types defined in Simple Feature Specification
#===============================================================================
cdef enum:
__GEOMETRY = 0
__POINT = 1
__CURVE = 2
__LINESTRING = 3
__SURFACE = 4
__POLYGON = 5
__COLLECTION = 6
__MULTIPOINT = 7
__MULTICURVE = 8
__MULTILINESTRING = 9
__MULTISURFACE = 10
__MULTIPOLYGON = 11
cdef struct Coordinate:
float x
float y
#===============================================================================
# Par. 3.2.3 Geometry Types
#===============================================================================
# More or less conforming to "Type level 1", i.e. supported are:
# Point, LineString, Polygon,
#
# In the (near) future, we will _probably_ support:
# MultiPoint, MultiLineString, MultiPolygon
#===============================================================================
cdef class Geometry:
# Abstract class -> no instances
cdef readonly unsigned int type
def __init__(self): #, int srid):
if self.__class__ is Geometry:
raise NotImplementedError
property geometry_type:
def __get__(self):
return geometry_type(self)
property empty:
def __get__(self):
return is_empty(self)
def __str__(self):
return as_text(self)
def __repr__(self):
return as_text(self)
cdef class Point(Geometry):
cdef Coordinate coordinate
property x:
def __get__(self):
return self.coordinate.x
def __set__(self, float value):
self.coordinate.x = value
property y:
def __get__(self):
return self.coordinate.y
def __set__(self, float value):
self.coordinate.y = value
def __init__(self, float x, float y): #, int srid = -1):
self.type = __POINT
self.coordinate.x = x
self.coordinate.y = y
# self.srid = srid
def __contains__(Point self, float item):
return (item == self.coordinate.x or item == self.coordinate.y)
def __richcmp__(Point self, Point other, unsigned int op):
# Meaning of op < 0, == 2, > 4, <= 1, != 3, >= 5
if op == 2: # ==
if type(other) == self.__class__ and self.x == other.x and
self.y == other.y: # and self.srid == other.srid:
return True
else:
return False
if op == 3: # !=
if type(other) == self.__class__ and (self.x != other.x or
self.y != other.y):# or self.srid != other.srid):
return True
else:
return False
def accept(Point self, Visitor visitor):
return visitor.visitPoint(self)
cdef class Curve(Geometry):
# Abstract class -> no instances
def __init__(self): #, int srid):
if self.__class__ is Curve:
raise NotImplementedError
cdef class LineString(Curve):
#cdef public int srid
cdef object points
def __init__(self): #, int srid = -1):
self.type = __LINESTRING
self.points = []
# self.srid = srid
def __richcmp__(LineString self, LineString other, unsigned int op):
# Meaning of op < 0, == 2, > 4, <= 1, != 3, >= 5
if op == 2: # ==
if type(other) == self.__class__ and self.points ==
other.points:# and self.srid == other.srid:
return True
else:
return False
if op == 3: # !=
if type(other) == self.__class__ and (self.points !=
other.points):# or self.srid != other.srid):
return True
else:
return False
def __setitem__(LineString self, unsigned int key, Point item):
#assert item.__class__.__name__, "Point"
self.points[key] = item
def __getitem__(LineString self, unsigned int n):
return self.points[n]
def __delitem__(LineString self, unsigned int n):
del self.points[n]
def __len__(LineString self):
return len(self.points)
def __contains__(LineString self, Point item):
return item in self.points
def append(LineString self, Point item):
#assert item.__class__.__name__, "Point"
self.points.append(item)
def accept(LineString self, Visitor visitor):
return visitor.visitLineString(self)
cdef class Surface(Geometry):
# Abstract class -> no instances
def __init__(self):
if self.__class__ is Surface:
raise NotImplementedError
cdef class Polygon(Surface):
#cdef public int srid
cdef object rings
def __init__(Polygon self): #, int srid = -1):
self.type = __POLYGON
self.rings = []
# self.srid = srid
def __richcmp__(Polygon self, Polygon other, unsigned int op):
# Meaning of op < 0, == 2, > 4, <= 1, != 3, >= 5
if op == 2: # ==
if type(other) == self.__class__ and self.rings ==
other.rings:# and self.srid == other.srid:
return True
else:
return False
if op == 3: # !=
if type(other) == self.__class__ and (self.rings !=
other.rings):# or self.srid != other.srid):
return True
else:
return False
def __len__(self):
return len(self.rings)
def __setitem__(Polygon self, unsigned int n, LineString item):
#assert item.__class__.__name__, "LineString"
self.rings[n] = item
def __getitem__(Polygon self, unsigned int n):
return self.rings[n]
def __contains__(Polygon self, LineString item):
return item in self.rings
def append(Polygon self, LineString item):
#assert item.__class__.__name__, "LineString"
self.rings.append(item)
def accept(Polygon self, Visitor visitor):
return visitor.visitPolygon(self)
##===============================================================================
## Par 3.2.6 Constructing a Geometry from Text (non-conforming)
##===============================================================================
#def geom_from_text(text, srid = -1):
# pass
##===============================================================================
## Par 3.2.7 Constructing a Geometry from WKB (non-conforming)
##===============================================================================
#def geom_from_wkb(wkb, srid = -1):
# pass
#===============================================================================
# Par 3.2.8 Obtaining WKT Representation
#===============================================================================
def as_text(Geometry geom):
pass
#
##===============================================================================
## Par 3.2.9 Obtaining WKB representation
##===============================================================================
#def as_binary(Geometry geom):
# pass
#
##===============================================================================
## Par 3.2.10 Functions on Geometry
##===============================================================================
#def dimension(Geometry geom):
# pass
def geometry_type(Geometry geom):
type_lookup= {0: "GEOMETRY",
1: "POINT",
2: "CURVE",
3: "LINESTRING",
4: "SURFACE",
5: "POLYGON",
6: "COLLECTION",
7: "MULTIPOINT",
8: "MULTICURVE",
9: "MULTILINESTRING",
10: "MULTISURFACE",
11: "MULTIPOLYGON",
}
return type_lookup[geom.type]
# TODO: support spatial reference id, or not?
#def srid(Geometry geom): # implemented as property?
# return geom.srid
def is_empty(Geometry geom):
if geom.type == __POINT:
return False # Point cannot be empty, at the moment
elif geom.type == __LINESTRING:
return num_points(geom) == 0
elif geom.type == __POLYGON:
return num_rings(geom) == 0
#===============================================================================
# Par 3.2.11 Functions on Type Point
#===============================================================================
def x(Point geom):
return geom.coordinate.x
def y(Point geom):
return geom.coordinate.y
##===============================================================================
## Par 3.2.13 LineString
##===============================================================================
def num_points(LineString geom):
return len(geom.points)
##===============================================================================
## Non SFS spec operations
##===============================================================================
def num_rings(Polygon geom):
return len(geom.rings)
def __rings(Polygon geom):
return geom.rings
def __points(LineString geom):
return geom.points
cdef class WKBReader:
cdef readonly char *byte_mask
cdef readonly char *uint32_mask
cdef readonly char *double_mask
cdef object stream
cdef unsigned int start, to_read, wkb_byte_order
def __init__(self):
self.wkb_byte_order = 0
# native byte order of platform
self.byte_mask = '@B'
def read_geometry(self, object wkb):
self.set_wkb(wkb)
return self._read_geom()
cdef set_wkb(self, object wkb):
self.stream = StringIO(wkb)
self.start = 0
self.to_read = 0
cdef void inc_ct(self, unsigned int val):
# absolute position to start
self.stream.seek(self.start, 0)
self.to_read = val
# next absolute position to end
self.start = self.to_read + self.start
cdef Geometry _read_geom(self):
cdef unsigned int geo_type
self.inc_ct(1)
self.wkb_byte_order = self.unpack_byte()
if self.wkb_byte_order == 0:
#xdr: big_endian
self.byte_mask = '>B'
self.uint32_mask = '>i'
self.double_mask = '>d'
elif self.wkb_byte_order == 1:
#ndr: small_endian
self.byte_mask = '<B'
self.uint32_mask = '<i'
self.double_mask = '<d'
self.inc_ct(4)
geo_type = self.unpack_uint32()
if geo_type == 1: # Point
return self._read_point()
elif geo_type == 2: # LineString
return self._read_linestring()
elif geo_type == 3: # Polygon
return self._read_polygon()
else:
raise NotImplementedError, "Type encountered that is not
implemented (yet)"
cdef Point _read_point(self):
cdef float x, y
self.inc_ct(8)
x = self.unpack_double()
self.inc_ct(8)
y = self.unpack_double()
return Point(x, y)
cdef LineString _read_linestring(self):
cdef LineString line
cdef unsigned int num_points, i
cdef float x, y
line = LineString()
self.inc_ct(4)
num_points = self.unpack_uint32()
for i from 0 <= i < num_points:
self.inc_ct(8)
x = self.unpack_double()
self.inc_ct(8)
y = self.unpack_double()
line.append(Point(x,y))
return line
cdef Polygon _read_polygon(self):
cdef Polygon poly
cdef LineString ring
cdef unsigned int num_points, num_rings, n, i
cdef float x, y
poly = Polygon()
self.inc_ct(4)
num_rings = self.unpack_uint32()
for n from 0 <= n < num_rings:
ring = LineString()
self.inc_ct(4)
num_points = self.unpack_uint32()
for i from 0 <= i < num_points:
self.inc_ct(8)
x = self.unpack_double()
self.inc_ct(8)
y = self.unpack_double()
ring.append(Point(x,y))
poly.append(ring)
return poly
cdef unsigned int unpack_byte(self):
return unpack(self.byte_mask, self.stream.read(self.to_read))[0]
cdef unsigned int unpack_uint32(self):
return unpack(self.uint32_mask, self.stream.read(self.to_read))[0]
cdef float unpack_double(self):
return unpack(self.double_mask, self.stream.read(self.to_read))[0]
cdef class WKTVisitor(Visitor):
def visit(WKTVisitor self, Geometry geom):
return geom.accept(self)
def visitPoint(WKTVisitor self, Point point):
return "POINT(%s)" % self.astext_point(point)
def visitLineString(WKTVisitor self, LineString linestring):
return "LINESTRING(%s)" % self.astext_linestring(linestring)
def visitPolygon(WKTVisitor self, Polygon polygon):
return "POLYGON((%s))" % self.astext_polygon(polygon)
cdef astext_point(WKTVisitor self, Point point):
cdef float x, y
x = point.x
y = point.y
return "%.3f %.3f" % (x,y)
cdef astext_linestring(WKTVisitor self, LineString linestring):
cdef object line_txt = []
cdef Point point
line_txt = [self.astext_point(point) for point in linestring.points]
return ", ".join(line_txt)
cdef astext_polygon(WKTVisitor self, Polygon polygon):
cdef object ring_txt = []
cdef LineString linearring
ring_txt = [self.astext_linestring(linearring) for linearring in
polygon.rings]
return "), (".join(ring_txt)
cdef class WKBVisitor(Visitor):
cdef readonly char *byte_mask
cdef readonly char *uint32_mask
cdef readonly char *double_mask
cdef object stream
cdef readonly unsigned int wkb_byte_order
def __cinit__(self):
self.stream = []
cdef bint big_endian_env
# find native byte order of platform
if pack("h", 1) == "\000\001": #xdr
big_endian_env = True
else: #ndr
big_endian_env = False
# and use this to encode geom to binary
# (wkb format is self-describing by first byte)
if big_endian_env:
self.wkb_byte_order = 0 # wkbXDR = 0
else:
self.wkb_byte_order = 1 # wkbNDR = 1
self.byte_mask = '@B'
self.uint32_mask = '@i'
self.double_mask = '@d'
def visit(WKBVisitor self, Geometry geom):
return geom.accept(self)
def visitPoint(WKBVisitor self, Point point):
cdef object wkb
self.pack_byte_order()
self.pack_geom_type(1)
self.pack_coordinate(point.x, point.y)
wkb = self.stream[:]
self.stream = []
return ''.join(wkb)
def visitLineString(WKBVisitor self, LineString linestring):
cdef object wkb
cdef unsigned int nr_points, n
self.pack_byte_order()
self.pack_geom_type(2)
nr_points = len(linestring)
self.pack_nr(nr_points)
for n from 0 <= n < nr_points:
self.pack_coordinate(linestring[n].x, linestring[n].y)
wkb = self.stream[:]
self.stream = []
return ''.join(wkb)
def visitPolygon(WKBVisitor self, Polygon polygon):
cdef object wkb
cdef unsigned int nr_rings, nr_points, n, r
self.pack_byte_order()
self.pack_geom_type(3)
nr_rings = len(polygon)
self.pack_nr(nr_rings)
for r from 0 <= r < nr_rings:
nr_points = len(polygon[r])
self.pack_nr(nr_points)
for n from 0 <= n < nr_points:
self.pack_coordinate(polygon[r][n].x, polygon[r][n].y)
wkb = self.stream[:]
self.stream = []
return ''.join(wkb)
# higher level "packers"
cdef pack_byte_order(self):
self.stream.append( self._pack_byte(self.wkb_byte_order) )
cdef pack_geom_type(self, unsigned int type):
self.stream.append( self._pack_uint32(type) )
cdef pack_coordinate(self, float x, float y):
self.stream.append( self._pack_double(x) )
self.stream.append( self._pack_double(y) )
cdef pack_nr(self, unsigned int nr):
self.stream.append( self._pack_uint32(nr) )
# low level "struct packers"
cdef _pack_double(self, float data):
return pack(self.double_mask, data)
cdef _pack_uint32(self, unsigned int data):
return pack(self.uint32_mask, data)
cdef _pack_byte(self, unsigned int data):
return pack(self.byte_mask, data)
# ------------------------------------------------------
# api_test.py
def api():
pt = Point(10, 1)
visitWKT = WKTVisitor()
visitWKB = WKBVisitor()
unserialize = WKBReader()
wkb = visitWKB.visit(pt)
start_time = time()
for i in xrange( int(1e4) ):
wkt = visitWKT.visit(pt)
print_timing_info(time() - start_time, 1e4, "WKT visitor, point")
start_time = time()
for i in xrange( int(1e4) ):
wkb = visitWKB.visit(pt)
print_timing_info(time() - start_time, 1e4, "WKB visitor, point")
ln = LineString()
for j in xrange( int(60) ):
pt = Point(999.325,j)
ln.append(pt)
ln[0] = Point(25, 1)
start_time = time()
for i in xrange( int(1e4) ):
wkt = visitWKT.visit(ln)
print_timing_info(time() - start_time, 1e4, "WKT visitor, line")
start_time = time()
for i in xrange( int(1e4) ):
wkb = visitWKB.visit(ln)
print_timing_info(time() - start_time, 1e4, "WKB visitor, line")
More information about the Pyrex
mailing list