From c4dfa55d6512290ad5d11e29f2470b57e6a36f39 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz
Date: Sat, 4 Aug 2012 18:02:45 +0200
Subject: [PATCH] Use HTMLParser instead of deprecated sgmllib in utils.web.

---
 src/utils/web.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/utils/web.py b/src/utils/web.py
index 8935f7ef5..949f8efe6 100644
--- a/src/utils/web.py
+++ b/src/utils/web.py
@@ -29,13 +29,14 @@
 ###
 
 import re
+import sys
 import socket
 import urllib
 import urllib2
 import httplib
-import sgmllib
 import urlparse
 import htmlentitydefs
+from HTMLParser import HTMLParser
 
 sockerrors = (socket.error,)
 try:
@@ -150,19 +151,19 @@ def getUrl(url, size=None, headers=None, data=None):
 def getDomain(url):
     return urlparse.urlparse(url)[1]
 
-class HtmlToText(sgmllib.SGMLParser):
+class HtmlToText(HTMLParser, object):
     """Taken from some eff-bot code on c.l.p."""
     entitydefs = htmlentitydefs.entitydefs.copy()
     entitydefs['nbsp'] = ' '
     def __init__(self, tagReplace=' '):
         self.data = []
         self.tagReplace = tagReplace
-        sgmllib.SGMLParser.__init__(self)
+        super(HtmlToText, self).__init__()
 
-    def unknown_starttag(self, tag, attr):
+    def handle_starttag(self, tag, attr):
         self.data.append(self.tagReplace)
 
-    def unknown_endtag(self, tag):
+    def handle_endtag(self, tag):
         self.data.append(self.tagReplace)
 
     def handle_data(self, data):
@@ -175,6 +176,8 @@ class HtmlToText(sgmllib.SGMLParser):
 def htmlToText(s, tagReplace=' '):
     """Turns HTML into text. tagReplace is a string to replace HTML tags with.
     """
+    if sys.version_info[0] >= 3 and isinstance(s, bytes):
+        s = s.decode()
     x = HtmlToText(tagReplace)
     x.feed(s)
     return x.getText()