from xml.dom import pulldom
import sys


def getInnerText(node):
    """
    Get all the inner text of a DOM node (recursively).

    Text and CDATA children contribute their ``data``; element children
    contribute their own inner text; every other node type (comments,
    processing instructions, ...) is ignored.

    The list of collected pieces is printed with ``repr`` before returning,
    once per call (so nested elements print before their parent).

    Returns the concatenation of the collected pieces as a single string.
    """
    # inspired by http://mail.python.org/pipermail/xml-sig/2005-March/011022.html
    inner_text = []
    for child in node.childNodes:
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
            inner_text.append(child.data)
        elif child.nodeType == child.ELEMENT_NODE:
            # append, not extend: the recursive call returns a string, and
            # extending a list with a string adds it one character at a time.
            inner_text.append(getInnerText(child))
    # Parenthesised single argument: prints the same under the Python 2
    # print statement and the Python 3 print function.
    print(repr(inner_text))
    return u"".join(inner_text)


class Deserializer(object):
    """
    Iterator that walks an XML stream with ``pulldom``.

    Every ``<object>`` element is expanded into a DOM subtree and the inner
    text of each of its ``<field>`` descendants is extracted with
    ``getInnerText`` (which prints it).

    NOTE(review): ``next()`` never returns an item -- it consumes the whole
    stream and then raises ``StopIteration`` -- so iterating an instance
    yields nothing. Presumably this is a cut-down reproduction script;
    confirm before relying on it as a real deserializer.
    """

    def __init__(self, stream):
        """Start a pulldom parse of *stream* (a file-like object or path)."""
        self.stream = stream
        self.event_stream = pulldom.parse(self.stream)

    def __iter__(self):
        return self

    def next(self):
        """
        Process all remaining ``<object>`` elements, then stop.

        Raises:
            StopIteration: always, once the event stream is exhausted.
        """
        for event, node in self.event_stream:
            if event == pulldom.START_ELEMENT and node.nodeName == "object":
                # Pull the rest of this element into a full DOM subtree so
                # its descendants can be queried.
                self.event_stream.expandNode(node)
                for field_node in node.getElementsByTagName("field"):
                    getInnerText(field_node)
        raise StopIteration

    # Python 3 spells the iterator protocol method __next__; keep both names.
    __next__ = next


if __name__ == "__main__":
    # Binary mode lets the XML parser honour the document's own encoding
    # declaration; the with-block guarantees the file is closed.
    with open('unicode.xml', 'rb') as fixture:
        for elements in Deserializer(fixture):
            pass