Package pyxmpp :: Module xmlextra
[hide private]

Source Code for Module pyxmpp.xmlextra

  1  # 
  2  # (C) Copyright 2003-2006 Jacek Konieczny <jajcus@jajcus.net> 
  3  # 
  4  # This program is free software; you can redistribute it and/or modify 
  5  # it under the terms of the GNU Lesser General Public License Version 
  6  # 2.1 as published by the Free Software Foundation. 
  7  # 
  8  # This program is distributed in the hope that it will be useful, 
  9  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 10  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 11  # GNU Lesser General Public License for more details. 
 12  # 
 13  # You should have received a copy of the GNU Lesser General Public 
 14  # License along with this program; if not, write to the Free Software 
 15  # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 16  # 
 17  # pylint: disable-msg=C0103, W0132, W0611 
 18   
 19  """Extension to libxml2 for XMPP stream and stanza processing""" 
 20   
 21  __revision__="$Id: xmlextra.py,v 1.15 2004/10/11 18:33:51 jajcus Exp $" 
 22  __docformat__="restructuredtext en" 
 23   
 24  import sys 
 25  import libxml2 
 26  import threading 
 27  import re 
 28   
 29  from pyxmpp.exceptions import StreamParseError 
 30   
 31  common_doc = libxml2.newDoc("1.0") 
 32  common_root = common_doc.newChild(None,"root",None) 
 33  COMMON_NS = "http://pyxmpp.jajcus.net/xmlns/common" 
 34  common_ns = common_root.newNs(COMMON_NS, None) 
 35  common_root.setNs(common_ns) 
 36  common_doc.setRootElement(common_root) 
 37   
class StreamHandler:
    """Base class for stream handlers.

    The underscore-prefixed methods receive raw libxml2 objects, wrap them
    in `libxml2.xmlDoc` / `libxml2.xmlNode` and forward to the public hooks
    (`stream_start`, `stream_end`, `stanza`).  The default hooks only report
    the event on standard error; `error` raises `StreamParseError`."""

    def __init__(self):
        pass

    def _stream_start(self, _doc):
        """Wrap the raw document and dispatch to `stream_start`."""
        self.stream_start(libxml2.xmlDoc(_doc))

    def _stream_end(self, _doc):
        """Wrap the raw document and dispatch to `stream_end`."""
        self.stream_end(libxml2.xmlDoc(_doc))

    def _stanza(self, _doc, _node):
        """Wrap the raw document and node and dispatch to `stanza`."""
        self.stanza(libxml2.xmlDoc(_doc), libxml2.xmlNode(_node))

    def stream_start(self, doc):
        """Called when the start tag of the root element is encountered
        in the stream.

        :Parameters:
            - `doc`: the document being parsed.
        :Types:
            - `doc`: `libxml2.xmlDoc`"""
        sys.stderr.write("Unhandled stream start: %s\n"
                         % (repr(doc.serialize()),))

    def stream_end(self, doc):
        """Called when the end tag of the root element is encountered
        in the stream.

        :Parameters:
            - `doc`: the document being parsed.
        :Types:
            - `doc`: `libxml2.xmlDoc`"""
        sys.stderr.write("Unhandled stream end %s\n"
                         % (repr(doc.serialize()),))

    def stanza(self, _unused, node):
        """Called when the end tag of a direct child of the root
        element is encountered in the stream.

        Please note that the node will be removed from the document
        and freed after this method returns.  If it is needed after
        that, a copy must be made before the method returns.

        :Parameters:
            - `_unused`: the document being parsed.
            - `node`: the (complete) element being processed.
        :Types:
            - `_unused`: `libxml2.xmlDoc`
            - `node`: `libxml2.xmlNode`"""
        sys.stderr.write("Unhandled stanza %s\n" % (repr(node.serialize()),))

    def error(self, descr):
        """Called when an error is encountered in the stream.

        :Parameters:
            - `descr`: description of the error.
        :Types:
            - `descr`: `str`

        :raise StreamParseError: always, carrying `descr`."""
        raise StreamParseError(descr)
103 104 try: 105 ######################################################################### 106 # C-extension based workarounds for libxml2 limitations 107 #------------------------------------------------------- 108 from pyxmpp import _xmlextra 109 from pyxmpp._xmlextra import error 110 111 _create_reader = _xmlextra.sax_reader_new 112
def replace_ns(node, old_ns, new_ns):
    """Replace namespaces in a whole subtree (C-extension backed).

    After the replacement the old namespace declaration is also removed
    via `_xmlextra.remove_ns` when `old_ns` was given.

    :Parameters:
        - `node`: the root of the subtree where namespaces should be replaced.
        - `old_ns`: the namespace to replace.
        - `new_ns`: the namespace to be used instead of old_ns.
    :Types:
        - `node`: `libxml2.xmlNode`
        - `old_ns`: `libxml2.xmlNs`
        - `new_ns`: `libxml2.xmlNs`

    Both old_ns and new_ns may be None meaning no namespace set."""

    def _raw(wrapper):
        # The extension functions take the raw object stored in `_o`,
        # not the Python wrapper; None is passed through unchanged.
        if wrapper is None:
            return None
        return wrapper._o

    raw_old = _raw(old_ns)
    raw_new = _raw(new_ns)
    raw_node = _raw(node)
    _xmlextra.replace_ns(raw_node, raw_old, raw_new)
    if raw_old:
        _xmlextra.remove_ns(raw_node, raw_old)
143 144 pure_python = False 145 146 except ImportError: 147 ######################################################################### 148 # Pure python implementation (slow workarounds for libxml2 limitations) 149 #-----------------------------------------------------------------------
class error(Exception):
    """Exception raised on a stream parse error.

    Pure-Python stand-in for the `error` name that is otherwise imported
    from `pyxmpp._xmlextra`."""
153
154 - def _escape(data):
155 """Escape data for XML""" 156 data=data.replace("&","&amp;") 157 data=data.replace("<","&lt;") 158 data=data.replace(">","&gt;") 159 data=data.replace("'","&apos;") 160 data=data.replace('"',"&quot;") 161 return data
162
class _SAXCallback(libxml2.SAXCallback):
    """SAX events handler for the python-only stream parser.

    The root element's start tag is turned into the stream document
    (`_doc`).  Every direct child of the root is accumulated as serialized
    text in `_current`; on its end tag the text is re-parsed between the
    root's start and end tags, the resulting element is copied into `_doc`
    and handed to `handler.stanza`.

    `_level` is the number of currently open elements: 0 outside the root,
    1 directly inside the root, 2 and more inside a stanza."""

    def __init__(self, handler):
        """Initialize the SAX handler.

        :Parameters:
            - `handler`: Object to handle stream start, end and stanzas.
        :Types:
            - `handler`: `StreamHandler`
        """
        self._handler = handler
        self._head = ""      # serialized start tag of the root element
        self._tail = ""      # matching end tag of the root element
        self._current = ""   # serialized text of the stanza being collected
        self._level = 0
        self._doc = None
        self._root = None

    def cdataBlock(self, data):
        """Append escaped CDATA content to the stanza being collected.

        Content outside of a stanza (level 1 or less) is ignored."""
        if self._level > 1:
            self._current += _escape(data)

    def characters(self, data):
        """Append escaped character data to the stanza being collected.

        Text outside of a stanza (level 1 or less) is ignored."""
        if self._level > 1:
            self._current += _escape(data)

    def comment(self, content):
        """Ignore comments."""
        pass

    def endDocument(self):
        """Nothing to do at the end of the document."""
        pass

    def endElement(self, tag):
        """Handle an end tag.

        Closing a direct child of the root delivers the collected stanza
        to `handler.stanza`; closing the root itself calls
        `handler.stream_end` and frees the stream document.

        An exception raised while attaching or handling the stanza is
        re-raised after the copied node has been unlinked and freed."""
        self._current += "</%s>" % (tag,)
        self._level -= 1
        if self._level > 1:
            # still inside a stanza
            return
        if self._level == 1:
            xml = self._head + self._current + self._tail
            doc = libxml2.parseDoc(xml)
            try:
                node = doc.getRootElement().children
                try:
                    # copy the stanza from the temporary document into the
                    # stream document before passing it on
                    node1 = node.docCopyNode(self._doc, 1)
                    try:
                        self._root.addChild(node1)
                        self._handler.stanza(self._doc, node1)
                    except:
                        node1.unlinkNode()
                        node1.freeNode()
                        del node1
                        # clean up, but do not hide the failure
                        raise
                    # NOTE(review): on success the node stays attached to
                    # the stream document, although `StreamHandler.stanza`
                    # documents it as removed and freed -- confirm intent.
                finally:
                    del node
            finally:
                doc.freeDoc()
        else:
            # the root element has been closed
            xml = self._head + self._tail
            doc = libxml2.parseDoc(xml)
            try:
                self._handler.stream_end(self._doc)
                self._doc.freeDoc()
                self._doc = None
                self._root = None
            finally:
                doc.freeDoc()

    def error(self, msg):
        """Forward a parser error to `handler.error`."""
        self._handler.error(msg)

    fatalError = error

    ignorableWhitespace = characters

    def reference(self, name):
        """Append an entity reference to the text being collected."""
        self._current += "&" + name + ";"

    def startDocument(self):
        """Nothing to do at the start of the document."""
        pass

    def startElement(self, tag, attrs):
        """Handle a start tag.

        The root's start tag creates the stream document and triggers
        `handler.stream_start`; a direct child of the root starts a new
        stanza; deeper elements are appended to the current stanza.

        :Parameters:
            - `tag`: qualified name of the element.
            - `attrs`: attributes of the element (mapping or `None`)."""
        s = "<" + tag
        if attrs:
            for a, v in attrs.items():
                s += " %s='%s'" % (a, _escape(v))
        s += ">"
        if self._level == 0:
            self._head = s
            self._tail = "</%s>" % (tag,)
            xml = self._head + self._tail
            self._doc = libxml2.parseDoc(xml)
            self._handler.stream_start(self._doc)
            self._root = self._doc.getRootElement()
        elif self._level == 1:
            self._current = s
        else:
            self._current += s
        self._level += 1

    def warning(self, msg=None):
        """Ignore parser warnings.

        `msg` is accepted (and discarded) so that a warning message passed
        by the parser does not cause a `TypeError`."""
        pass
273
274 - class _PythonReader:
275 """Python-only stream reader."""
276 - def __init__(self,handler):
277 """Initialize the reader. 278 279 :Parameters: 280 - `handler`: Object to handle stream start, end and stanzas. 281 :Types: 282 - `handler`: `StreamHandler` 283 """ 284 self.handler = handler 285 self.sax = _SAXCallback(handler) 286 self.parser = libxml2.createPushParser(self.sax, '', 0, 'stream')
287
288 - def feed(self, data):
289 """Feed the parser with a chunk of data. Apropriate methods 290 of `self.handler` will be called whenever something interesting is 291 found. 292 293 :Parameters: 294 - `data`: the chunk of data to parse. 295 :Types: 296 - `data`: `str`""" 297 return self.parser.parseChunk(data, len(data), 0)
298 299 _create_reader = _PythonReader 300
301 - def _get_ns(node):
302 """Get namespace of node. 303 304 :return: the namespace object or `None` if the node has no namespace 305 assigned. 306 :returntype: `libxml2.xmlNs`""" 307 try: 308 return node.ns() 309 except libxml2.treeError: 310 return None
311
def replace_ns(node, old_ns, new_ns):
    """Replace namespaces in a whole subtree (pure-Python implementation).

    The namespace of `node`, of its attributes and, recursively, of its
    child elements is switched from `old_ns` to `new_ns`.  A namespace is
    considered to be `old_ns` when both its URI and its prefix match.
    Child elements carrying a namespace declaration with the same prefix
    as `old_ns` are skipped together with their subtrees.

    :Parameters:
        - `node`: the root of the subtree where namespaces should be replaced.
        - `old_ns`: the namespace to replace.
        - `new_ns`: the namespace to be used instead of old_ns.
    :Types:
        - `node`: `libxml2.xmlNode`
        - `old_ns`: `libxml2.xmlNs`
        - `new_ns`: `libxml2.xmlNs`

    Both old_ns and new_ns may be None meaning no namespace set."""
    if old_ns is None:
        old_uri = None
        old_prefix = None
    else:
        old_uri = old_ns.content
        old_prefix = old_ns.name

    def _is_old(ns):
        # "No namespace" matches only when no old namespace was given.
        if ns is None:
            return old_ns is None
        return bool(ns) and ns.content == old_uri and ns.name == old_prefix

    def _redeclares_old_prefix(element):
        # True when the element itself declares a namespace under the
        # prefix of the old one.
        try:
            decl = element.nsDefs()
        except libxml2.treeError:
            return False
        while decl:
            if decl.name == old_prefix:
                return True
            decl = decl.next
        return False

    if _is_old(_get_ns(node)):
        node.setNs(new_ns)

    attr = node.properties
    while attr:
        if _is_old(_get_ns(attr)):
            attr.setNs(new_ns)
        attr = attr.next

    child = node.children
    while child:
        if child.type == 'element' and not _redeclares_old_prefix(child):
            replace_ns(child, old_ns, new_ns)
        child = child.next
364 365 pure_python = True 366 367 ########################################################### 368 # Common code 369 #------------- 370
def get_node_ns(xmlnode):
    """Namespace of an XML node.

    `xmlnode.ns()` raising `libxml2.treeError` is reported as `None`.

    :Parameters:
        - `xmlnode`: the XML node to query.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: namespace of the node or `None`
    :returntype: `libxml2.xmlNs`"""
    try:
        namespace = xmlnode.ns()
    except libxml2.treeError:
        namespace = None
    return namespace
385
def get_node_ns_uri(xmlnode):
    """Return namespace URI of an XML node.

    :Parameters:
        - `xmlnode`: the XML node to query.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: namespace URI of the node (decoded from UTF-8) or `None`
        when the node has no namespace.
    :returntype: `unicode`"""
    namespace = get_node_ns(xmlnode)
    if not namespace:
        return None
    return unicode(namespace.getContent(), "utf-8")
401
def xml_node_iter(nodelist):
    """Iterate over sibling XML nodes. All types of nodes will be returned
    (not only the elements).

    Usually used to iterate over a node's children like this::

        xml_node_iter(node.children)

    :Parameters:
        - `nodelist`: start node of the list.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while True:
        if not current:
            break
        yield current
        # follow the sibling link only after the consumer resumes us
        current = current.next
419
def xml_element_iter(nodelist):
    """Iterate over sibling XML elements. Non-element nodes will be skipped.

    Usually used to iterate over a node's children like this::

        xml_element_iter(node.children)

    :Parameters:
        - `nodelist`: start node of the list.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while current:
        is_element = current.type == "element"
        if is_element:
            yield current
        current = current.next
437
def xml_element_ns_iter(nodelist, ns_uri):
    """Iterate over sibling XML elements. Only elements in the given
    namespace will be returned.

    Usually used to iterate over a node's children like this::

        xml_element_ns_iter(node.children, ns_uri)

    :Parameters:
        - `nodelist`: start node of the list.
        - `ns_uri`: namespace URI the returned elements must have, compared
          with the result of `get_node_ns_uri`.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while current:
        if current.type == "element":
            if get_node_ns_uri(current) == ns_uri:
                yield current
        current = current.next
# Control characters below U+0020 other than TAB (\011), LF (\012) and
# CR (\015).
evil_characters_re = re.compile(r"[\000-\010\013\014\016-\037]", re.UNICODE)
# U+FFFD REPLACEMENT CHARACTER encoded as UTF-8, used for non-unicode input.
utf8_replacement_char = u"\ufffd".encode("utf-8")


def remove_evil_characters(s):
    """Replace control characters (not allowed in XML) in a string.

    Each character matched by `evil_characters_re` is replaced by
    U+FFFD: as a unicode character for unicode input, as its UTF-8
    encoding otherwise.

    :Parameters:
        - `s`: the string to sanitize.

    :return: the string with the offending characters replaced."""
    if isinstance(s, type(u"")):
        replacement = u"\ufffd"
    else:
        replacement = utf8_replacement_char
    return evil_characters_re.sub(replacement, s)
# Matches an ``xmlns=...`` default namespace declaration inside the first
# tag of a serialized element; group 1 is everything preceding it.
bad_nsdef_replace_re = re.compile(r"^([^<]*\<[^><]*\s+)(xmlns=((\"[^\"]*\")|(\'[^\']*\')))")


def safe_serialize(xmlnode):
    """Serialize an XML element making sure the result is sane.

    A default namespace declaration on the element which is not the
    element's own namespace is cut out of the serialized start tag, and
    control characters are replaced via `remove_evil_characters`.

    :Parameters:
        - `xmlnode`: the XML element to serialize.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: UTF-8 encoded serialized and sanitized element.
    :returntype: `string`"""
    try:
        own_ns = xmlnode.ns()
    except libxml2.treeError:
        own_ns = None
    try:
        decl = xmlnode.nsDefs()
    except libxml2.treeError:
        decl = None
    serialized = xmlnode.serialize(encoding="UTF-8")
    while decl:
        if decl.name is None:
            # a default (prefix-less) namespace declaration
            is_own = bool(own_ns) and (
                (decl.name, decl.content) == (own_ns.name, own_ns.content))
            if not is_own:
                serialized = bad_nsdef_replace_re.sub("\\1", serialized, 1)
                break
        decl = decl.next
    return remove_evil_characters(serialized)
497
class StreamReader:
    """A simple push-parser interface for XML streams."""

    def __init__(self, handler):
        """Initialize `StreamReader` object.

        :Parameters:
            - `handler`: handler object for the stream content
        :Types:
            - `handler`: `StreamHandler` derived class
        """
        self.reader = _create_reader(handler)
        self.lock = threading.RLock()
        self.in_use = 0   # non-zero while `feed` is executing

    def doc(self):
        """Get the document being parsed.

        :return: the document or `None` when the underlying reader
            reports none.
        :returntype: `libxml2.xmlDoc`"""
        raw = self.reader.doc()
        if not raw:
            return None
        return libxml2.xmlDoc(raw)

    def feed(self, s):
        """Pass a string to the stream parser.

        :Parameters:
            - `s`: string to parse.
        :Types:
            - `s`: `str`

        :return: `None` on EOF, `False` when whole input was parsed and `True`
            if there is something still left in the buffer.

        :raise StreamParseError: when called while another `feed` call
            is still in progress."""
        self.lock.acquire()
        try:
            # The lock is re-entrant, so a nested call from the same thread
            # gets this far; the flag is what detects it.
            if self.in_use:
                raise StreamParseError("StreamReader.feed() is not reentrant!")
            self.in_use = 1
            try:
                return self.reader.feed(s)
            finally:
                self.in_use = 0
        finally:
            self.lock.release()
541 542 543 # vi: sts=4 et sw=4 544