| 1
21
22 __all__ = ['FeedParser']
23
24 import re
25
26 from email import errors
27 from email import message
28
# Line-ending patterns.  Each one recognizes CRLF, a bare CR, or a bare LF.
# NLCRE has no capturing group; it is used with match() to test whether a
# line starts with a line ending.
NLCRE = re.compile('\r\n|\r|\n')
# Capturing variant, used with match() to find a line ending at the very
# beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Capturing variant anchored with '$', used with search() to find a line
# ending at the very end of a string.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Capturing variant used with split(): because of the group, the separators
# are kept in the resulting list so lines can be re-assembled with their
# original line endings.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# Matches the start of a line that belongs to a header section: a 'From '
# envelope line, a field name followed by a colon (one or more characters in
# the octal ranges \041-\071 and \073-\176, i.e. printable ASCII without
# space and without ':' which is \072), or leading whitespace (a
# continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel object.  BufferedSubFile.readline() returns it (and the
# parsing generator yields it) when no complete line is buffered and the
# input has not been closed yet.  Always compare against it with 'is'.
NeedMoreData = object()
40
41
42
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Data from the last push() that does not form a complete line yet.
        self._partial = ''
        # Complete lines in reverse order: the next line to be read is the
        # last element, so readline()/unreadline() are a cheap pop()/append().
        self._lines = []
        # Stack of false-EOF predicates (callables taking a line).
        self._eofstack = []
        # Set by close(); once set, an empty buffer means real EOF.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # The partial line is the newest data, so it has to be queued behind
        # every line that is still buffered (pushlines() inserts at the
        # "read last" end).  Nothing is queued when there is no partial data;
        # an empty buffer on a closed file already reads as EOF.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed.  '' is returned when the buffer is empty
        and the object is closed, or when any pushed EOF predicate matches the
        next line; in the latter case the line stays buffered.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        line = self._lines.pop()
        # Check every predicate, most recently pushed first, so that a
        # predicate pushed by an outer level is still honoured while an inner
        # level is being read.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # False EOF: put the line back so the next reader sees it.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one readline() returns."""
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Prepend whatever was left over from the previous push.
        data, self._partial = self._partial + data, ''
        # Because NLCRE_crack has a capturing group, split() returns
        # [text, eol, text, eol, ..., rest]; 'rest' is the data after the
        # final line ending ('' if the data ended with one).
        parts = NLCRE_crack.split(data)
        self._partial = parts.pop()
        # If the data ends with a bare CR we cannot tell yet whether it is a
        # complete line ending or the first half of a CRLF whose LF arrives
        # with the next push.  Hold that last line back as partial data so a
        # split CRLF is not reported as two line endings.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # Re-attach each line ending to the text preceding it.
        lines = []
        for i in range(len(parts) // 2):
            lines.append(parts[i*2] + parts[i*2+1])
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines (given in reading order) after buffered ones."""
        # _lines is kept reversed, so new lines go to the front, reversed.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        # Iteration stops at (false) EOF; NeedMoreData is passed through to
        # the caller as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
130
131
132
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer that feed() fills and the parsing generator drains.
        self._input = BufferedSubFile()
        # Stack of messages under construction; nested parts are pushed on
        # top of the message that contains them.
        self._msgstack = []
        # Bound next() of the parsing generator.  Each call resumes parsing
        # until the generator yields NeedMoreData or finishes.  Creating the
        # generator does not run any of its code yet.
        self._parse = self._parsegen().next
        # Message on top of _msgstack (None when the stack is empty).
        self._cur = None
        # Most recently created message; re-pointed by _parsegen after a
        # subpart is finished so trailing newlines can be trimmed from it.
        self._last = None
        self._headersonly = False

    def _set_headersonly(self):
        # After this, everything following the headers is stored verbatim as
        # the payload (see the _headersonly branch in _parsegen).
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parsing generator.  StopIteration only means the
        # generator has run to completion, which is not an error here.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Record a defect when the root declares a multipart content type
        # but is_multipart() reports that it is not one.
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        # Parts created while a multipart/digest is current default to
        # message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (headers plus body) from
        # self._input.  It yields NeedMoreData whenever the input runs dry
        # and is resumed by _call_parse() after more data was fed.  It calls
        # itself recursively for nested messages and multipart subparts and
        # forwards their NeedMoreData yields.
        #
        # Create a new message and start by collecting header lines.
        self._new_message()
        headers = []
        # Collect lines until one does not look like a header or a
        # continuation line (this includes the empty separator line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # A line starting with a line ending is the header/body
                # separator and is thrown away.  Anything else already
                # belongs to the body, so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Parse the collected headers; they determine how the body is read.
        self._parse_headers(headers)
        # Headers-only mode: all remaining input becomes the payload as is.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # The body is read as a sequence of header blocks.  Each block
            # is parsed by a recursive _parsegen() call into its own nested
            # message, with NLCRE.match installed as false-EOF predicate so
            # that a blank line ends the block.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # Pop the predicate so that the reads below can tell the end
                # of the input apart from the end of a header block.
                self._input.pop_eof_matcher()
                # First read: consume the line the nested parse stopped at
                # (presumably the blank separator line, or EOF).
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                # Second read: peek at the following line to find out
                # whether the input is exhausted.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF, so this line starts the next header block.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # Any other message/* type: what follows is parsed as one nested
            # message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # Declared multipart but no boundary available: record the
                # defect and store everything up to EOF as the payload.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Build a pattern matching both the inter-part boundary and the
            # closing boundary (the one followed by '--').  It is installed
            # as false-EOF predicate only while a subpart is being parsed.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # Closing boundary: this multipart is done.  Remember
                    # whether the boundary line had a line ending; it decides
                    # below how the epilogue list is initialized.
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # First inter-part boundary: finish the preamble.
                    if capturing_preamble:
                        if preamble:
                            # The line ending before the boundary is treated
                            # as part of the boundary, so strip it from the
                            # last preamble line.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Push the boundary back; it is handled again on the
                        # next iteration, now with capturing_preamble False.
                        self._input.unreadline(line)
                        continue
                    # A boundary between two parts.  Consume any further
                    # boundary lines that follow directly; no subpart is
                    # created between such repeated boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse the subpart; the input is positioned
                    # at its first line and the next boundary acts as EOF.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # The line ending preceding the boundary is not part of
                    # the subpart just parsed: trim it from the epilogue (if
                    # that part is itself a multipart) or from its string
                    # payload.
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Point _last at this multipart so that, if it is nested,
                    # the enclosing level trims its trailing line ending.
                    self._last = self._cur
                else:
                    # No boundary seen on this line; it can only be part of
                    # the preamble.
                    assert capturing_preamble
                    preamble.append(line)
            # Either EOF or the closing boundary was reached.  If the
            # preamble is still being captured, no start boundary was ever
            # seen: record the defect and store the captured text as payload.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): remaining lines are consumed here but never
                # appended to 'epilogue', so the epilogue is always ''.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the closing boundary had a line ending, seed the list with
            # '' so the bookkeeping below has a first element to work on.
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # A line ending at the very front of the epilogue is stripped.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Any other content type: the rest of the input (up to the real or
        # false EOF) becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # 'lines' is the list of raw header lines collected by _parsegen for
        # the current message.  Headers are stored on self._cur; problems
        # are recorded in self._cur.defects.
        #
        # Name and value pieces of the header currently being assembled.
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Continuation line (starts with space or tab)?
            if line[0] in ' \t':
                if not lastheader:
                    # A continuation with no header before it: record the
                    # defect and otherwise ignore the line.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A new header starts here, so store the pending one first.
            if lastheader:
                # Drops the final character of the joined value and then any
                # remaining trailing CR/LF characters.
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Envelope ("unix-from") line?
            if line.startswith('From '):
                if lineno == 0:
                    # Strip the trailing line ending and store it.
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # As the last collected line it is treated as the first
                    # line of the body: push it back and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Anywhere else it is recorded as a defect and ignored.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line at the first colon into field name and value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # All lines processed; store the header that is still pending.
        if lastheader:
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
|