+# or utidylib .
TIDY_MARKUP = 0
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
# ---------- required modules (should come with any Python distribution) ----------
-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi
+import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
from cStringIO import StringIO as _StringIO
except:
@@ -60,26 +83,6 @@ try:
import zlib
except:
zlib = None
-
-# timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
-# Python 2.3 now has this functionality available in the standard socket library, so under
-# 2.3 you don't need to install anything. But you probably should anyway, because the socket
-# module is buggy and timeoutsocket is better.
-try:
- import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
- timeoutsocket.setDefaultSocketTimeout(20)
-except ImportError:
- import socket
- if hasattr(socket, 'setdefaulttimeout'):
- socket.setdefaulttimeout(20)
-import urllib, urllib2
-
-_mxtidy = None
-if TIDY_MARKUP:
- try:
- from mx.Tidy import Tidy as _mxtidy
- except:
- pass
# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
@@ -93,9 +96,9 @@ try:
except:
_XML_AVAILABLE = 0
def _xmlescape(data):
- data = data.replace("&", "&amp;")
- data = data.replace(">", "&gt;")
- data = data.replace("<", "&lt;")
+ data = data.replace('&', '&amp;')
+ data = data.replace('>', '&gt;')
+ data = data.replace('<', '&lt;')
return data
# base64 support for Atom feeds that contain embedded binary data
@@ -115,10 +118,22 @@ try:
except:
pass
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+ if _debug:
+ import chardet.constants
+ chardet.constants._debug = 1
+except:
+ chardet = None
+
# ---------- don't touch these ----------
-class CharacterEncodingOverride(Exception): pass
-class CharacterEncodingUnknown(Exception): pass
-class NonXMLContentType(Exception): pass
+class ThingsNobodyCaresAboutButMe(Exception): pass
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
+class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+class UndeclaredNamespace(Exception): pass
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
- return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)
+ return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
# match namespaces
if tag.find(':') <> -1:
@@ -368,7 +447,7 @@ class _FeedParserMixin:
# special hack for better tracking of empty textinput/image elements in illformed feeds
if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
self.intextinput = 0
- if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'width', 'height'):
+ if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
self.inimage = 0
# call special handler (if defined) or default handler
@@ -399,12 +478,12 @@ class _FeedParserMixin:
self.pop(prefix + suffix)
# track inline content
- if self.incontent and self.contentparams.get('mode') == 'escaped':
+ if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
# element declared itself as escaped markup, but it isn't really
- self.contentparams['mode'] = 'xml'
- if self.incontent and self.contentparams.get('mode') == 'xml':
+ self.contentparams['type'] = 'application/xhtml+xml'
+ if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
tag = tag.split(':')[-1]
- self.handle_data("</%s>" % tag, escape=0)
+ self.handle_data('</%s>' % tag, escape=0)
# track xml:base and xml:lang going out of scope
if self.basestack:
@@ -417,11 +496,11 @@ class _FeedParserMixin:
self.lang = self.langstack[-1]
def handle_charref(self, ref):
- # called for each character reference, e.g. for "&#160;", ref will be "160"
+ # called for each character reference, e.g. for '&#160;', ref will be '160'
if not self.elementstack: return
ref = ref.lower()
if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
- text = "&#%s;" % ref
+ text = '&#%s;' % ref
else:
if ref[0] == 'x':
c = int(ref[1:], 16)
@@ -431,23 +510,23 @@ class _FeedParserMixin:
self.elementstack[-1][2].append(text)
def handle_entityref(self, ref):
- # called for each entity reference, e.g. for "&copy;", ref will be "copy"
+ # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
if not self.elementstack: return
- if _debug: sys.stderr.write("entering handle_entityref with %s\n" % ref)
+ if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
text = '&%s;' % ref
else:
# entity resolution graciously donated by Aaron Swartz
def name2cp(k):
import htmlentitydefs
- if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
+ if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
k = htmlentitydefs.entitydefs[k]
- if k.startswith("&#") and k.endswith(";"):
+ if k.startswith('&#') and k.endswith(';'):
return int(k[2:-1]) # not in latin-1
return ord(k)
try: name2cp(ref)
- except KeyError: text = "&%s;" % ref
+ except KeyError: text = '&%s;' % ref
else: text = unichr(name2cp(ref)).encode('utf-8')
self.elementstack[-1][2].append(text)
@@ -455,7 +534,7 @@ class _FeedParserMixin:
# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
if not self.elementstack: return
- if escape and self.contentparams.get('mode') == 'xml':
+ if escape and self.contentparams.get('type') == 'application/xhtml+xml':
text = _xmlescape(text)
self.elementstack[-1][2].append(text)
@@ -472,7 +551,7 @@ class _FeedParserMixin:
def parse_declaration(self, i):
# override internal declaration handler to handle CDATA blocks
- if _debug: sys.stderr.write("entering parse_declaration\n")
+ if _debug: sys.stderr.write('entering parse_declaration\n')
if self.rawdata[i:i+9] == '<![CDATA[':
    k = self.rawdata.find(']]>', i)
if k == -1: k = len(self.rawdata)
@@ -482,20 +561,36 @@ class _FeedParserMixin:
k = self.rawdata.find('>', i)
return k+1
+ def mapContentType(self, contentType):
+ contentType = contentType.lower()
+ if contentType == 'text':
+ contentType = 'text/plain'
+ elif contentType == 'html':
+ contentType = 'text/html'
+ elif contentType == 'xhtml':
+ contentType = 'application/xhtml+xml'
+ return contentType
+
def trackNamespace(self, prefix, uri):
- if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
+ loweruri = uri.lower()
+ if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
self.version = 'rss090'
- if uri == 'http://purl.org/rss/1.0/' and not self.version:
+ if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
self.version = 'rss10'
- if not prefix: return
- if uri.find('backend.userland.com/rss') <> -1:
+ if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+ self.version = 'atom10'
+ if loweruri.find('backend.userland.com/rss') <> -1:
# match any backend.userland.com namespace
uri = 'http://backend.userland.com/rss'
- if self.namespaces.has_key(uri):
- self.namespacemap[prefix] = self.namespaces[uri]
+ loweruri = uri
+ if self._matchnamespaces.has_key(loweruri):
+ self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+ self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+ else:
+ self.namespacesInUse[prefix or ''] = uri
def resolveURI(self, uri):
- return urlparse.urljoin(self.baseuri or '', uri)
+ return _urljoin(self.baseuri or '', uri)
def decodeEntities(self, element, data):
return data
@@ -503,17 +598,18 @@ class _FeedParserMixin:
def push(self, element, expectingText):
self.elementstack.append([element, expectingText, []])
- def pop(self, element):
+ def pop(self, element, stripWhitespace=1):
if not self.elementstack: return
if self.elementstack[-1][0] != element: return
-
- element, expectingText, pieces = self.elementstack.pop()
- output = "".join(pieces)
- output = output.strip()
- if not expectingText: return output
+ element, expectingText, pieces = self.elementstack.pop()
+ output = ''.join(pieces)
+ if stripWhitespace:
+ output = output.strip()
+ if not expectingText: return output
+
# decode base64 content
- if self.contentparams.get('mode') == 'base64' and base64:
+ if base64 and self.contentparams.get('base64', 0):
try:
output = base64.decodestring(output)
except binascii.Error:
@@ -526,37 +622,46 @@ class _FeedParserMixin:
output = self.resolveURI(output)
# decode entities within embedded markup
- output = self.decodeEntities(element, output)
+ if not self.contentparams.get('base64', 0):
+ output = self.decodeEntities(element, output)
+
+ # remove temporary cruft from contentparams
+ try:
+ del self.contentparams['mode']
+ except KeyError:
+ pass
+ try:
+ del self.contentparams['base64']
+ except KeyError:
+ pass
# resolve relative URIs within embedded markup
- if self.contentparams.get('type', 'text/html') in self.html_types:
+ if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
# sanitize embedded markup
- if self.contentparams.get('type', 'text/html') in self.html_types:
+ if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding)
- if self.encoding and (type(output) == types.StringType):
+ if self.encoding and type(output) != type(u''):
try:
output = unicode(output, self.encoding)
except:
pass
-
+
+ # categories/tags/keywords/whatever are handled in _end_category
+ if element == 'category':
+ return output
+
# store output in appropriate place(s)
- if self.inentry:
+ if self.inentry and not self.insource:
if element == 'content':
self.entries[-1].setdefault(element, [])
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
self.entries[-1][element].append(contentparams)
- elif element == 'category':
- self.entries[-1][element] = output
- domain = self.entries[-1]['categories'][-1][0]
- self.entries[-1]['categories'][-1] = (domain, output)
- elif element == 'source':
- self.entries[-1]['source']['value'] = output
elif element == 'link':
self.entries[-1][element] = output
if output:
@@ -569,21 +674,34 @@ class _FeedParserMixin:
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
self.entries[-1][element + '_detail'] = contentparams
- elif self.infeed and (not self.intextinput) and (not self.inimage):
+ elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
+ context = self._getContext()
if element == 'description':
- element = 'tagline'
- self.feeddata[element] = output
- if element == 'category':
- domain = self.feeddata['categories'][-1][0]
- self.feeddata['categories'][-1] = (domain, output)
- elif element == 'link':
- self.feeddata['links'][-1]['href'] = output
+ element = 'subtitle'
+ context[element] = output
+ if element == 'link':
+ context['links'][-1]['href'] = output
elif self.incontent:
contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output
- self.feeddata[element + '_detail'] = contentparams
+ context[element + '_detail'] = contentparams
return output
+ def pushContent(self, tag, attrsD, defaultContentType, expectingText):
+ self.incontent += 1
+ self.contentparams = FeedParserDict({
+ 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
+ 'language': self.lang,
+ 'base': self.baseuri})
+ self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
+ self.push(tag, expectingText)
+
+ def popContent(self, tag):
+ value = self.pop(tag)
+ self.incontent -= 1
+ self.contentparams.clear()
+ return value
+
def _mapToStandardPrefix(self, name):
colonpos = name.find(':')
if colonpos <> -1:
@@ -596,11 +714,34 @@ class _FeedParserMixin:
def _getAttribute(self, attrsD, name):
return attrsD.get(self._mapToStandardPrefix(name))
+ def _isBase64(self, attrsD, contentparams):
+ if attrsD.get('mode', '') == 'base64':
+ return 1
+ if self.contentparams['type'].startswith('text/'):
+ return 0
+ if self.contentparams['type'].endswith('+xml'):
+ return 0
+ if self.contentparams['type'].endswith('/xml'):
+ return 0
+ return 1
+
+ def _itsAnHrefDamnIt(self, attrsD):
+ href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
+ if href:
+ try:
+ del attrsD['url']
+ except KeyError:
+ pass
+ try:
+ del attrsD['uri']
+ except KeyError:
+ pass
+ attrsD['href'] = href
+ return attrsD
+
def _save(self, key, value):
- if self.inentry:
- self.entries[-1].setdefault(key, value)
- elif self.feeddata:
- self.feeddata.setdefault(key, value)
+ context = self._getContext()
+ context.setdefault(key, value)
def _start_rss(self, attrsD):
versionmap = {'0.91': 'rss091u',
@@ -661,7 +802,7 @@ class _FeedParserMixin:
def _end_image(self):
self.pop('image')
self.inimage = 0
-
+
def _start_textinput(self, attrsD):
self.intextinput = 1
self.push('textinput', 0)
@@ -680,6 +821,7 @@ class _FeedParserMixin:
_start_managingeditor = _start_author
_start_dc_author = _start_author
_start_dc_creator = _start_author
+ _start_itunes_author = _start_author
def _end_author(self):
self.pop('author')
@@ -688,6 +830,16 @@ class _FeedParserMixin:
_end_managingeditor = _end_author
_end_dc_author = _end_author
_end_dc_creator = _end_author
+ _end_itunes_author = _end_author
+
+ def _start_itunes_owner(self, attrsD):
+ self.inpublisher = 1
+ self.push('publisher', 0)
+
+ def _end_itunes_owner(self):
+ self.pop('publisher')
+ self.inpublisher = 0
+ self._sync_author_detail('publisher')
def _start_contributor(self, attrsD):
self.incontributor = 1
@@ -699,19 +851,34 @@ class _FeedParserMixin:
def _end_contributor(self):
self.pop('contributor')
self.incontributor = 0
-
+
+ def _start_dc_contributor(self, attrsD):
+ self.incontributor = 1
+ context = self._getContext()
+ context.setdefault('contributors', [])
+ context['contributors'].append(FeedParserDict())
+ self.push('name', 0)
+
+ def _end_dc_contributor(self):
+ self._end_name()
+ self.incontributor = 0
+
def _start_name(self, attrsD):
self.push('name', 0)
+ _start_itunes_name = _start_name
def _end_name(self):
value = self.pop('name')
- if self.inauthor:
+ if self.inpublisher:
+ self._save_author('name', value, 'publisher')
+ elif self.inauthor:
self._save_author('name', value)
elif self.incontributor:
self._save_contributor('name', value)
elif self.intextinput:
context = self._getContext()
context['textinput']['name'] = value
+ _end_itunes_name = _end_name
def _start_width(self, attrsD):
self.push('width', 0)
@@ -740,19 +907,19 @@ class _FeedParserMixin:
context['image']['height'] = value
def _start_url(self, attrsD):
- self.push('url', 1)
+ self.push('href', 1)
_start_homepage = _start_url
_start_uri = _start_url
def _end_url(self):
- value = self.pop('url')
+ value = self.pop('href')
if self.inauthor:
- self._save_author('url', value)
+ self._save_author('href', value)
elif self.incontributor:
- self._save_contributor('url', value)
+ self._save_contributor('href', value)
elif self.inimage:
context = self._getContext()
- context['image']['url'] = value
+ context['image']['href'] = value
elif self.intextinput:
context = self._getContext()
context['textinput']['link'] = value
@@ -761,26 +928,31 @@ class _FeedParserMixin:
def _start_email(self, attrsD):
self.push('email', 0)
+ _start_itunes_email = _start_email
def _end_email(self):
value = self.pop('email')
- if self.inauthor:
+ if self.inpublisher:
+ self._save_author('email', value, 'publisher')
+ elif self.inauthor:
self._save_author('email', value)
elif self.incontributor:
self._save_contributor('email', value)
- pass
+ _end_itunes_email = _end_email
def _getContext(self):
- if self.inentry:
+ if self.insource:
+ context = self.sourcedata
+ elif self.inentry:
context = self.entries[-1]
else:
context = self.feeddata
return context
- def _save_author(self, key, value):
+ def _save_author(self, key, value, prefix='author'):
context = self._getContext()
- context.setdefault('author_detail', FeedParserDict())
- context['author_detail'][key] = value
+ context.setdefault(prefix + '_detail', FeedParserDict())
+ context[prefix + '_detail'][key] = value
self._sync_author_detail()
def _save_contributor(self, key, value):
@@ -795,7 +967,7 @@ class _FeedParserMixin:
name = detail.get('name')
email = detail.get('email')
if name and email:
- context[key] = "%s (%s)" % (name, email)
+ context[key] = '%s (%s)' % (name, email)
elif name:
context[key] = name
elif email:
@@ -803,7 +975,7 @@ class _FeedParserMixin:
else:
author = context.get(key)
if not author: return
- emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
+ emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
if not emailmatch: return
email = emailmatch.group(0)
# probably a better way to do the following, but it passes all the tests
@@ -818,38 +990,26 @@ class _FeedParserMixin:
context.setdefault('%s_detail' % key, FeedParserDict())
context['%s_detail' % key]['name'] = author
context['%s_detail' % key]['email'] = email
-
- def _start_tagline(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
- 'type': attrsD.get('type', 'text/plain'),
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('tagline', 1)
- _start_subtitle = _start_tagline
- def _end_tagline(self):
- value = self.pop('tagline')
- self.incontent -= 1
- self.contentparams.clear()
- if self.infeed:
- self.feeddata['description'] = value
- _end_subtitle = _end_tagline
-
- def _start_copyright(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
- 'type': attrsD.get('type', 'text/plain'),
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('copyright', 1)
- _start_dc_rights = _start_copyright
+ def _start_subtitle(self, attrsD):
+ self.pushContent('subtitle', attrsD, 'text/plain', 1)
+ _start_tagline = _start_subtitle
+ _start_itunes_subtitle = _start_subtitle
- def _end_copyright(self):
- self.pop('copyright')
- self.incontent -= 1
- self.contentparams.clear()
- _end_dc_rights = _end_copyright
+ def _end_subtitle(self):
+ self.popContent('subtitle')
+ _end_tagline = _end_subtitle
+ _end_itunes_subtitle = _end_subtitle
+
+ def _start_rights(self, attrsD):
+ self.pushContent('rights', attrsD, 'text/plain', 1)
+ _start_dc_rights = _start_rights
+ _start_copyright = _start_rights
+
+ def _end_rights(self):
+ self.popContent('rights')
+ _end_dc_rights = _end_rights
+ _end_copyright = _end_rights
def _start_item(self, attrsD):
self.entries.append(FeedParserDict())
@@ -885,38 +1045,42 @@ class _FeedParserMixin:
self.pop('publisher')
self._sync_author_detail('publisher')
_end_webmaster = _end_dc_publisher
-
- def _start_dcterms_issued(self, attrsD):
- self.push('issued', 1)
- _start_issued = _start_dcterms_issued
- def _end_dcterms_issued(self):
- value = self.pop('issued')
- self._save('issued_parsed', _parse_date(value))
- _end_issued = _end_dcterms_issued
+ def _start_published(self, attrsD):
+ self.push('published', 1)
+ _start_dcterms_issued = _start_published
+ _start_issued = _start_published
- def _start_dcterms_created(self, attrsD):
+ def _end_published(self):
+ value = self.pop('published')
+ self._save('published_parsed', _parse_date(value))
+ _end_dcterms_issued = _end_published
+ _end_issued = _end_published
+
+ def _start_updated(self, attrsD):
+ self.push('updated', 1)
+ _start_modified = _start_updated
+ _start_dcterms_modified = _start_updated
+ _start_pubdate = _start_updated
+ _start_dc_date = _start_updated
+
+ def _end_updated(self):
+ value = self.pop('updated')
+ parsed_value = _parse_date(value)
+ self._save('updated_parsed', parsed_value)
+ _end_modified = _end_updated
+ _end_dcterms_modified = _end_updated
+ _end_pubdate = _end_updated
+ _end_dc_date = _end_updated
+
+ def _start_created(self, attrsD):
self.push('created', 1)
- _start_created = _start_dcterms_created
+ _start_dcterms_created = _start_created
- def _end_dcterms_created(self):
+ def _end_created(self):
value = self.pop('created')
self._save('created_parsed', _parse_date(value))
- _end_created = _end_dcterms_created
-
- def _start_dcterms_modified(self, attrsD):
- self.push('modified', 1)
- _start_modified = _start_dcterms_modified
- _start_dc_date = _start_dcterms_modified
- _start_pubdate = _start_dcterms_modified
-
- def _end_dcterms_modified(self):
- value = self.pop('modified')
- parsed_value = _parse_date(value)
- self._save('modified_parsed', parsed_value)
- _end_modified = _end_dcterms_modified
- _end_dc_date = _end_dcterms_modified
- _end_pubdate = _end_dcterms_modified
+ _end_dcterms_created = _end_created
def _start_expirationdate(self, attrsD):
self.push('expired', 1)
@@ -937,56 +1101,74 @@ class _FeedParserMixin:
def _end_creativecommons_license(self):
self.pop('license')
+ def _addTag(self, term, scheme, label):
+ context = self._getContext()
+ tags = context.setdefault('tags', [])
+ if (not term) and (not scheme) and (not label): return
+ value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
+ if value not in tags:
+ tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
+
def _start_category(self, attrsD):
+ if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
+ term = attrsD.get('term')
+ scheme = attrsD.get('scheme', attrsD.get('domain'))
+ label = attrsD.get('label')
+ self._addTag(term, scheme, label)
self.push('category', 1)
- domain = self._getAttribute(attrsD, 'domain')
- cats = []
- if self.inentry:
- cats = self.entries[-1].setdefault('categories', [])
- elif self.infeed:
- cats = self.feeddata.setdefault('categories', [])
- cats.append((domain, None))
_start_dc_subject = _start_category
_start_keywords = _start_category
+ def _end_itunes_keywords(self):
+ for term in self.pop('itunes_keywords').split():
+ self._addTag(term, 'http://www.itunes.com/', None)
+
+ def _start_itunes_category(self, attrsD):
+ self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
+ self.push('category', 1)
+
def _end_category(self):
- self.pop('category')
+ value = self.pop('category')
+ if not value: return
+ context = self._getContext()
+ tags = context['tags']
+ if value and len(tags) and not tags[-1]['term']:
+ tags[-1]['term'] = value
+ else:
+ self._addTag(value, None, None)
_end_dc_subject = _end_category
_end_keywords = _end_category
-
+ _end_itunes_category = _end_category
+
def _start_cloud(self, attrsD):
- self.feeddata['cloud'] = FeedParserDict(attrsD)
+ self._getContext()['cloud'] = FeedParserDict(attrsD)
def _start_link(self, attrsD):
attrsD.setdefault('rel', 'alternate')
attrsD.setdefault('type', 'text/html')
+ attrsD = self._itsAnHrefDamnIt(attrsD)
if attrsD.has_key('href'):
attrsD['href'] = self.resolveURI(attrsD['href'])
- expectingText = self.infeed or self.inentry
- if self.inentry:
- self.entries[-1].setdefault('links', [])
- self.entries[-1]['links'].append(FeedParserDict(attrsD))
- elif self.infeed:
- self.feeddata.setdefault('links', [])
- self.feeddata['links'].append(FeedParserDict(attrsD))
+ expectingText = self.infeed or self.inentry or self.insource
+ context = self._getContext()
+ context.setdefault('links', [])
+ context['links'].append(FeedParserDict(attrsD))
+ if attrsD['rel'] == 'enclosure':
+ self._start_enclosure(attrsD)
if attrsD.has_key('href'):
expectingText = 0
- if attrsD.get('type', '') in self.html_types:
- if self.inentry:
- self.entries[-1]['link'] = attrsD['href']
- elif self.infeed:
- self.feeddata['link'] = attrsD['href']
+ if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
+ context['link'] = attrsD['href']
else:
self.push('link', expectingText)
_start_producturl = _start_link
def _end_link(self):
value = self.pop('link')
+ context = self._getContext()
if self.intextinput:
- context = self._getContext()
context['textinput']['link'] = value
if self.inimage:
- context = self._getContext()
context['image']['link'] = value
_end_producturl = _end_link
@@ -998,89 +1180,70 @@ class _FeedParserMixin:
value = self.pop('id')
self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
if self.guidislink:
- # guid acts as link, but only if "ispermalink" is not present or is "true",
+ # guid acts as link, but only if 'ispermalink' is not present or is 'true',
# and only if the item doesn't already have a link element
self._save('link', value)
- def _start_id(self, attrsD):
- self.push('id', 1)
-
- def _end_id(self):
- value = self.pop('id')
-
def _start_title(self, attrsD):
- self.incontent += 1
- if _debug: sys.stderr.write('attrsD.xml:lang = %s\n' % attrsD.get('xml:lang'))
- if _debug: sys.stderr.write('self.lang = %s\n' % self.lang)
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
- 'type': attrsD.get('type', 'text/plain'),
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('title', self.infeed or self.inentry)
+ self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
_start_dc_title = _start_title
+ _start_media_title = _start_title
def _end_title(self):
- value = self.pop('title')
- self.incontent -= 1
- self.contentparams.clear()
- if self.intextinput:
- context = self._getContext()
- context['textinput']['title'] = value
- elif self.inimage:
- context = self._getContext()
- context['image']['title'] = value
- _end_dc_title = _end_title
-
- def _start_description(self, attrsD, default_content_type='text/html'):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
- 'type': attrsD.get('type', default_content_type),
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('description', self.infeed or self.inentry)
-
- def _start_abstract(self, attrsD):
- return self._start_description(attrsD, 'text/plain')
-
- def _end_description(self):
- value = self.pop('description')
- self.incontent -= 1
- self.contentparams.clear()
+ value = self.popContent('title')
context = self._getContext()
if self.intextinput:
- context['textinput']['description'] = value
+ context['textinput']['title'] = value
elif self.inimage:
- context['image']['description'] = value
-# elif self.inentry:
-# context['summary'] = value
-# elif self.infeed:
-# context['tagline'] = value
+ context['image']['title'] = value
+ _end_dc_title = _end_title
+ _end_media_title = _end_title
+
+ def _start_description(self, attrsD):
+ context = self._getContext()
+ if context.has_key('summary'):
+ self._summaryKey = 'content'
+ self._start_content(attrsD)
+ else:
+ self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
+
+ def _start_abstract(self, attrsD):
+ self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
+
+ def _end_description(self):
+ if self._summaryKey == 'content':
+ self._end_content()
+ else:
+ value = self.popContent('description')
+ context = self._getContext()
+ if self.intextinput:
+ context['textinput']['description'] = value
+ elif self.inimage:
+ context['image']['description'] = value
+ self._summaryKey = None
_end_abstract = _end_description
def _start_info(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
- 'type': attrsD.get('type', 'text/plain'),
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('info', 1)
+ self.pushContent('info', attrsD, 'text/plain', 1)
+ _start_feedburner_browserfriendly = _start_info
def _end_info(self):
- self.pop('info')
- self.incontent -= 1
- self.contentparams.clear()
+ self.popContent('info')
+ _end_feedburner_browserfriendly = _end_info
def _start_generator(self, attrsD):
if attrsD:
- if attrsD.has_key('url'):
- attrsD['url'] = self.resolveURI(attrsD['url'])
- self.feeddata['generator_detail'] = FeedParserDict(attrsD)
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ if attrsD.has_key('href'):
+ attrsD['href'] = self.resolveURI(attrsD['href'])
+ self._getContext()['generator_detail'] = FeedParserDict(attrsD)
self.push('generator', 1)
def _end_generator(self):
value = self.pop('generator')
- if self.feeddata.has_key('generator_detail'):
- self.feeddata['generator_detail']['name'] = value
+ context = self._getContext()
+ if context.has_key('generator_detail'):
+ context['generator_detail']['name'] = value
def _start_admin_generatoragent(self, attrsD):
self.push('generator', 1)
@@ -1088,7 +1251,7 @@ class _FeedParserMixin:
if value:
self.elementstack[-1][2].append(value)
self.pop('generator')
- self.feeddata['generator_detail'] = FeedParserDict({"url": value})
+ self._getContext()['generator_detail'] = FeedParserDict({'href': value})
def _start_admin_errorreportsto(self, attrsD):
self.push('errorreportsto', 1)
@@ -1098,79 +1261,82 @@ class _FeedParserMixin:
self.pop('errorreportsto')
def _start_summary(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
- 'type': attrsD.get('type', 'text/plain'),
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('summary', 1)
+ context = self._getContext()
+ if context.has_key('summary'):
+ self._summaryKey = 'content'
+ self._start_content(attrsD)
+ else:
+ self._summaryKey = 'summary'
+ self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
+ _start_itunes_summary = _start_summary
def _end_summary(self):
- value = self.pop('summary')
- if self.entries:
- self.entries[-1]['description'] = value
- self.incontent -= 1
- self.contentparams.clear()
+ if self._summaryKey == 'content':
+ self._end_content()
+ else:
+ self.popContent(self._summaryKey or 'summary')
+ self._summaryKey = None
+ _end_itunes_summary = _end_summary
def _start_enclosure(self, attrsD):
- if self.inentry:
- self.entries[-1].setdefault('enclosures', [])
- self.entries[-1]['enclosures'].append(FeedParserDict(attrsD))
+ attrsD = self._itsAnHrefDamnIt(attrsD)
+ self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
+ href = attrsD.get('href')
+ if href:
+ context = self._getContext()
+ if not context.get('id'):
+ context['id'] = href
def _start_source(self, attrsD):
- if self.inentry:
- self.entries[-1]['source'] = FeedParserDict(attrsD)
- self.push('source', 1)
+ self.insource = 1
def _end_source(self):
- self.pop('source')
+ self.insource = 0
+ self._getContext()['source'] = copy.deepcopy(self.sourcedata)
+ self.sourcedata.clear()
def _start_content(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
- 'type': attrsD.get('type', 'text/plain'),
- 'language': self.lang,
- 'base': self.baseuri})
+ self.pushContent('content', attrsD, 'text/plain', 1)
+ src = attrsD.get('src')
+ if src:
+ self.contentparams['src'] = src
self.push('content', 1)
def _start_prodlink(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
- 'type': attrsD.get('type', 'text/html'),
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('content', 1)
+ self.pushContent('content', attrsD, 'text/html', 1)
def _start_body(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': 'xml',
- 'type': 'application/xhtml+xml',
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('content', 1)
+ self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
_start_xhtml_body = _start_body
def _start_content_encoded(self, attrsD):
- self.incontent += 1
- self.contentparams = FeedParserDict({'mode': 'escaped',
- 'type': 'text/html',
- 'language': self.lang,
- 'base': self.baseuri})
- self.push('content', 1)
+ self.pushContent('content', attrsD, 'text/html', 1)
_start_fullitem = _start_content_encoded
def _end_content(self):
- value = self.pop('content')
- if self.contentparams.get('type') in (['text/plain'] + self.html_types):
+ copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
+ value = self.popContent('content')
+ if copyToDescription:
self._save('description', value)
- self.incontent -= 1
- self.contentparams.clear()
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
_end_prodlink = _end_content
+ def _start_itunes_image(self, attrsD):
+ self.push('itunes_image', 0)
+ self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
+ _start_itunes_link = _start_itunes_image
+
+ def _end_itunes_block(self):
+ value = self.pop('itunes_block', 0)
+ self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
+
+ def _end_itunes_explicit(self):
+ value = self.pop('itunes_explicit', 0)
+ self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
+
if _XML_AVAILABLE:
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
def __init__(self, baseuri, baselang, encoding):
@@ -1185,14 +1351,22 @@ if _XML_AVAILABLE:
def startElementNS(self, name, qname, attrs):
namespace, localname = name
- namespace = str(namespace or '')
- if namespace.find('backend.userland.com/rss') <> -1:
+ lowernamespace = str(namespace or '').lower()
+ if lowernamespace.find('backend.userland.com/rss') <> -1:
# match any backend.userland.com namespace
namespace = 'http://backend.userland.com/rss'
- prefix = self.namespaces.get(namespace, 'unknown')
+ lowernamespace = namespace
+ if qname and qname.find(':') > 0:
+ givenprefix = qname.split(':')[0]
+ else:
+ givenprefix = None
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
+ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
+ raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
if prefix:
localname = prefix + ':' + localname
localname = str(localname).lower()
+ if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
# qname implementation is horribly broken in Python 2.1 (it
# doesn't report any), and slightly broken in Python 2.2 (it
@@ -1203,24 +1377,26 @@ if _XML_AVAILABLE:
# tirelessly telling me that it didn't work yet.
attrsD = {}
for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
- prefix = self.namespaces.get(namespace, '')
+ lowernamespace = (namespace or '').lower()
+ prefix = self._matchnamespaces.get(lowernamespace, '')
if prefix:
- attrlocalname = prefix + ":" + attrlocalname
+ attrlocalname = prefix + ':' + attrlocalname
attrsD[str(attrlocalname).lower()] = attrvalue
for qname in attrs.getQNames():
attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
self.unknown_starttag(localname, attrsD.items())
-# def resolveEntity(self, publicId, systemId):
-# return _StringIO()
-
def characters(self, text):
self.handle_data(text)
def endElementNS(self, name, qname):
namespace, localname = name
- namespace = str(namespace)
- prefix = self.namespaces.get(namespace, '')
+ lowernamespace = str(namespace or '').lower()
+ if qname and qname.find(':') > 0:
+ givenprefix = qname.split(':')[0]
+ else:
+ givenprefix = ''
+ prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
if prefix:
localname = prefix + ':' + localname
localname = str(localname).lower()
@@ -1247,50 +1423,61 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
self.pieces = []
sgmllib.SGMLParser.reset(self)
+ def _shorttag_replace(self, match):
+ tag = match.group(1)
+ if tag in self.elements_no_end_tag:
+ return '<' + tag + ' />'
+ else:
+ return '<' + tag + '></' + tag + '>'
+
def feed(self, data):
data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
- data = re.sub(r'<(\S+?)\s*?/>', r'<\1></\1>', data)
+ #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
+ data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
data = data.replace('&#39;', "'")
data = data.replace('&#34;', '"')
- if self.encoding and (type(data) == types.UnicodeType):
+ if self.encoding and type(data) == type(u''):
data = data.encode(self.encoding)
sgmllib.SGMLParser.feed(self, data)
def normalize_attrs(self, attrs):
# utility method to be called by descendants
attrs = [(k.lower(), v) for k, v in attrs]
-# if self.encoding:
-# if _debug: sys.stderr.write('normalize_attrs, encoding=%s\n' % self.encoding)
-# attrs = [(k, v.encode(self.encoding)) for k, v in attrs]
attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
return attrs
def unknown_starttag(self, tag, attrs):
# called for each start tag
# attrs is a list of (attr, value) tuples
- # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
+ # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
- strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
+ uattrs = []
+ # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
+ for key, value in attrs:
+ if type(value) != type(u''):
+ value = unicode(value, self.encoding)
+ uattrs.append((unicode(key, self.encoding), value))
+ strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
if tag in self.elements_no_end_tag:
- self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
+ self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
else:
- self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
-
+ self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+
def unknown_endtag(self, tag):
- # called for each end tag, e.g. for </pre>, tag will be "pre"
+ # called for each end tag, e.g. for </pre>, tag will be 'pre'
# Reconstruct the original end tag.
if tag not in self.elements_no_end_tag:
self.pieces.append("</%(tag)s>" % locals())
def handle_charref(self, ref):
- # called for each character reference, e.g. for "&#160;", ref will be "160"
+ # called for each character reference, e.g. for '&#160;', ref will be '160'
# Reconstruct the original character reference.
- self.pieces.append("&#%(ref)s;" % locals())
+ self.pieces.append('&#%(ref)s;' % locals())
def handle_entityref(self, ref):
- # called for each entity reference, e.g. for "&copy;", ref will be "copy"
+ # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
# Reconstruct the original entity reference.
- self.pieces.append("&%(ref)s;" % locals())
+ self.pieces.append('&%(ref)s;' % locals())
def handle_data(self, text):
# called for each block of plain text, i.e. outside of any tag and
@@ -1302,19 +1489,19 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
def handle_comment(self, text):
# called for each HTML comment, e.g. <!-- insert Python interpreter here -->
# Reconstruct the original comment.
- self.pieces.append("<!--%(text)s-->" % locals())
+ self.pieces.append('<!--%(text)s-->' % locals())
def handle_pi(self, text):
# called for each processing instruction, e.g. <?instruction>
# Reconstruct original processing instruction.
- self.pieces.append("<?%(text)s>" % locals())
+ self.pieces.append('<?%(text)s>' % locals())
def handle_decl(self, text):
# called for the DOCTYPE, if present, e.g.
# <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
# Reconstruct original DOCTYPE
- self.pieces.append("<!%(text)s>" % locals())
+ self.pieces.append('<!%(text)s>' % locals())
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
def _scan_name(self, i, declstartpos):
@@ -1335,8 +1522,8 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
return None, -1
def output(self):
- """Return processed HTML as a single string"""
- return "".join([str(p) for p in self.pieces])
+ '''Return processed HTML as a single string'''
+ return ''.join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
def __init__(self, baseuri, baselang, encoding):
@@ -1354,7 +1541,7 @@ class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
data = data.replace('&#x22;', '&quot;')
data = data.replace('&#39;', '&apos;')
data = data.replace('&#x27;', '&apos;')
- if self.contentparams.get('mode') == 'escaped':
+ if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
data = data.replace('&lt;', '<')
data = data.replace('&gt;', '>')
data = data.replace('&amp;', '&')
@@ -1394,7 +1581,7 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
self.baseuri = baseuri
def resolveURI(self, uri):
- return urlparse.urljoin(self.baseuri, uri)
+ return _urljoin(self.baseuri, uri)
def unknown_starttag(self, tag, attrs):
attrs = self.normalize_attrs(attrs)
@@ -1402,7 +1589,7 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
- if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
+ if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
p = _RelativeURIResolver(baseURI, encoding)
p.feed(htmlSource)
return p.output()
@@ -1464,18 +1651,42 @@ def _sanitizeHTML(htmlSource, encoding):
p = _HTMLSanitizer(encoding)
p.feed(htmlSource)
data = p.output()
- if _mxtidy and TIDY_MARKUP:
- nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
- if data.count(''):
- data = data.split('>', 1)[1]
- if data.count(''):
+ data = data.split('>', 1)[1]
+ if data.count('