Web: Add workaround for minified Reddit URLs

This commit is contained in:
Valentin Lorentz 2024-10-14 08:34:51 +02:00
parent 2aa1f916b0
commit b13ebebc83
2 changed files with 38 additions and 10 deletions

View File

@ -150,22 +150,47 @@ class Web(callbacks.PluginRegexp):
def getTitle(self, irc, url, raiseErrors, msg):
size = conf.supybot.protocols.http.peekSize()
parsed_url = utils.web.urlparse(url)
if parsed_url.netloc in ('youtube.com', 'youtu.be') \
or parsed_url.netloc.endswith(('.youtube.com')):
# there is a lot of Javascript before the <title>
size = max(819200, size)
if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
# Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
# <title> on every page.
parsed_url = parsed_url._replace(netloc='old.reddit.com')
url = utils.web.urlunparse(parsed_url)
def url_workaround(url):
"""Returns a new URL that should be the target of a new request,
or None if the request is fine as it is.
The returned URL may be the same as the parameter, in case
something else was changed by this function through side-effects.
"""
nonlocal size
parsed_url = utils.web.urlparse(url)
print(repr(parsed_url.netloc))
if parsed_url.netloc in ('youtube.com', 'youtu.be') \
or parsed_url.netloc.endswith(('.youtube.com')):
# there is a lot of Javascript before the <title>
if size < 819200:
size = max(819200, size)
return url
else:
return None
if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
# Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
# <title> on every page.
parsed_url = parsed_url._replace(netloc='old.reddit.com')
url = utils.web.urlunparse(parsed_url)
self.log.debug("Rewrite URL to %s", url)
return url
return None
url = url_workaround(url) or url
timeout = self.registryValue('timeout')
headers = conf.defaultHttpHeaders(irc.network, msg.channel)
try:
fd = utils.web.getUrlFd(url, timeout=timeout, headers=headers)
target = fd.geturl()
fixed_target = url_workaround(target)
if fixed_target is not None:
# happens when using minification services linking to one of
# the websites handled by url_workaround; eg. v.redd.it
fd.close()
fd = utils.web.getUrlFd(fixed_target, timeout=timeout, headers=headers)
target = fd.geturl()
text = fd.read(size)
response_headers = fd.headers
fd.close()

View File

@ -84,6 +84,9 @@ class WebTestCase(ChannelPluginTestCase):
self.assertRegexp(
'title https://www.reddit.com/r/irc/',
'Internet Relay Chat')
self.assertRegexp(
'title https://v.redd.it/odhemxo6giud1',
'Small Kitty Big Goals : MadeMeSmile')
def testTitleMarcinfo(self):
# Checks that we don't crash on 'Content-Type: text/html;'