mirror of
https://github.com/progval/Limnoria.git
synced 2025-04-25 20:41:18 -05:00
Web: Add workaround for minified Reddit URLs
This commit is contained in:
parent
2aa1f916b0
commit
b13ebebc83
@ -150,22 +150,47 @@ class Web(callbacks.PluginRegexp):
|
|||||||
def getTitle(self, irc, url, raiseErrors, msg):
|
def getTitle(self, irc, url, raiseErrors, msg):
|
||||||
size = conf.supybot.protocols.http.peekSize()
|
size = conf.supybot.protocols.http.peekSize()
|
||||||
|
|
||||||
|
def url_workaround(url):
|
||||||
|
"""Returns a new URL that should be the target of a new request,
|
||||||
|
or None if the request is fine as it is.
|
||||||
|
|
||||||
|
The returned URL may be the same as the parameter, in case
|
||||||
|
something else was changed by this function through side-effects.
|
||||||
|
"""
|
||||||
|
nonlocal size
|
||||||
parsed_url = utils.web.urlparse(url)
|
parsed_url = utils.web.urlparse(url)
|
||||||
|
print(repr(parsed_url.netloc))
|
||||||
if parsed_url.netloc in ('youtube.com', 'youtu.be') \
|
if parsed_url.netloc in ('youtube.com', 'youtu.be') \
|
||||||
or parsed_url.netloc.endswith(('.youtube.com')):
|
or parsed_url.netloc.endswith(('.youtube.com')):
|
||||||
# there is a lot of Javascript before the <title>
|
# there is a lot of Javascript before the <title>
|
||||||
|
if size < 819200:
|
||||||
size = max(819200, size)
|
size = max(819200, size)
|
||||||
|
return url
|
||||||
|
else:
|
||||||
|
return None
|
||||||
if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
|
if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
|
||||||
# Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
|
# Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
|
||||||
# <title> on every page.
|
# <title> on every page.
|
||||||
parsed_url = parsed_url._replace(netloc='old.reddit.com')
|
parsed_url = parsed_url._replace(netloc='old.reddit.com')
|
||||||
url = utils.web.urlunparse(parsed_url)
|
url = utils.web.urlunparse(parsed_url)
|
||||||
|
self.log.debug("Rewrite URL to %s", url)
|
||||||
|
return url
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
url = url_workaround(url) or url
|
||||||
timeout = self.registryValue('timeout')
|
timeout = self.registryValue('timeout')
|
||||||
headers = conf.defaultHttpHeaders(irc.network, msg.channel)
|
headers = conf.defaultHttpHeaders(irc.network, msg.channel)
|
||||||
try:
|
try:
|
||||||
fd = utils.web.getUrlFd(url, timeout=timeout, headers=headers)
|
fd = utils.web.getUrlFd(url, timeout=timeout, headers=headers)
|
||||||
target = fd.geturl()
|
target = fd.geturl()
|
||||||
|
fixed_target = url_workaround(target)
|
||||||
|
if fixed_target is not None:
|
||||||
|
# happens when using minification services linking to one of
|
||||||
|
# the websites handled by url_workaround; eg. v.redd.it
|
||||||
|
fd.close()
|
||||||
|
fd = utils.web.getUrlFd(fixed_target, timeout=timeout, headers=headers)
|
||||||
|
target = fd.geturl()
|
||||||
text = fd.read(size)
|
text = fd.read(size)
|
||||||
response_headers = fd.headers
|
response_headers = fd.headers
|
||||||
fd.close()
|
fd.close()
|
||||||
|
@ -84,6 +84,9 @@ class WebTestCase(ChannelPluginTestCase):
|
|||||||
self.assertRegexp(
|
self.assertRegexp(
|
||||||
'title https://www.reddit.com/r/irc/',
|
'title https://www.reddit.com/r/irc/',
|
||||||
'Internet Relay Chat')
|
'Internet Relay Chat')
|
||||||
|
self.assertRegexp(
|
||||||
|
'title https://v.redd.it/odhemxo6giud1',
|
||||||
|
'Small Kitty Big Goals : MadeMeSmile')
|
||||||
|
|
||||||
def testTitleMarcinfo(self):
|
def testTitleMarcinfo(self):
|
||||||
# Checks that we don't crash on 'Content-Type: text/html;'
|
# Checks that we don't crash on 'Content-Type: text/html;'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user