1 # Copyright (C) 2004-2006 Python Software Foundation
2 # Authors: Baxter, Wouters and Warsaw
3 # Contact: email-sig@python.org
4 
5 """FeedParser - An email feed parser.
6 
7 The feed parser implements an interface for incrementally parsing an email
8 message, line by line.  This has advantages for certain applications, such as
9 those reading email messages off a socket.
10 
11 FeedParser.feed() is the primary interface for pushing new data into the
12 parser.  It returns when there's nothing more it can do with the available
13 data.  When you have no more data to push into the parser, call .close().
14 This completes the parsing and returns the root message object.
15 
16 The other advantage of this parser is that it will never throw a parsing
17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
18 the current message.  Defects are just instances that live on the message
19 object's .defects attribute.
20 """
21 
22 __all__ = ['FeedParser']
23 
24 import re
25 
26 from email import errors
27 from email import message
28 
# Matches one line terminator (CRLF, lone CR or lone LF).  Used with .match()
# both to recognize the blank header/body separator line and as the false-EOF
# predicate that ends a block of message/delivery-status headers.
NLCRE = re.compile('\r\n|\r|\n')
# Same terminators, captured in a group; used with .match() to strip a line
# terminator from the very beginning of the epilogue.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# A line terminator anchored at the end of the string; used with .search() to
# strip the final newline from a preamble, payload, epilogue or unix-from line.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Used with .split() to crack pushed data into lines.  The capturing group
# makes re.split keep the terminators in the resulting list.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".  The pattern also accepts a
# unix-from line ("From ") and a continuation line (leading tab or space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
# Separator used when joining collected lines back into a single string.
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() and yielded by the parser
# generator when no complete line is available yet.  Compare with "is".
NeedMoreData = object()
40 
41 
42 
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    Text is pushed in arbitrary chunks with push() and read back one complete
    line at a time with readline().  While no complete line is buffered and
    the object has not been closed, readline() returns the NeedMoreData
    sentinel instead of a line.

    You can also push and pop line-matching predicates onto a stack.  When
    any predicate on the stack matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the last element.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Add pred, a callable taking a line, to the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the end of input, queueing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it must be queued behind every line still waiting to be read.  An
        # empty remainder is not a line and is not queued at all.
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when nothing is buffered and the object has
        not been closed.  A line matched by a false-EOF predicate stays in
        the buffer, so it is seen again once the predicate has been popped.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push line back so that it is the next one readline() returns."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # A chunk that ends in a bare CR is ambiguous: the matching LF of a
        # CRLF pair may arrive with the next push.  Hold that whole line back
        # as the partial so the pair is never split into a CR-terminated line
        # followed by a spurious empty line.
        if not self._partial and parts and parts[-1].endswith('\r'):
            eol = parts.pop()
            self._partial = parts.pop() + eol
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines behind those already waiting to be read."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: stop at (false) EOF; may return NeedMoreData."""
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
130 
131 
132 
class FeedParser:
    """A feed-style parser of email.

    Push text in with feed() as it becomes available, then call close() to
    finish parsing and get the root message object back.  Problems found in
    the input are recorded on the affected message's .defects list.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer shared by every (possibly nested) run of _parsegen().
        self._input = BufferedSubFile()
        # Stack of the messages under construction; the root is at the bottom.
        self._msgstack = []
        # Creating the generator runs none of its body, so the attributes
        # assigned below are in place before the first line is parsed.
        self._parse = self._parsegen().next
        # The message currently being parsed (top of _msgstack), or None.
        self._cur = None
        # The message whose trailing newline is trimmed when a boundary line
        # follows it.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parser generator until it needs more data or ends."""
        try:
            self._parse()
        except StopIteration:
            # The generator has finished the root message; nothing left to do.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        # Test against None explicitly rather than relying on the truth value
        # of the message object.
        if (self._cur is not None
                and self._cur.get_content_type() == 'multipart/digest'):
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for its subparts.

        Yields NeedMoreData each time the input buffer runs dry and finishes
        once the message has been parsed up to EOF or a false EOF.  The new
        message is left on top of the message stack for the caller to pop.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the same boundary line on the next pass, now
                        # that we are no longer in the preamble.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    # Keep the text instead of silently dropping it.
                    epilogue.append(line)
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines in the current message.

        lines is the list of raw header lines, each matching headerRE.
        Continuation lines are joined onto the header they follow.  A
        unix-from line is stored separately when it comes first; other
        irregularities are recorded as defects on the current message.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip only the trailing line terminator, exactly as is done
                # for the final header below, so that a value lacking one
                # does not lose its last character.
                lhdr = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')