Web: Add workaround for minified Reddit URLs

2025-04-25 20:41:18 -05:00 · 2024-10-14 08:34:51 +02:00 · 2024-10-14 08:34:51 +02:00 · b13ebebc83
commit b13ebebc83
parent 2aa1f916b0
2 changed files with 38 additions and 10 deletions
--- a/plugins/Web/plugin.py
+++ b/plugins/Web/plugin.py
@ -150,22 +150,47 @@ class Web(callbacks.PluginRegexp):
    def getTitle(self, irc, url, raiseErrors, msg):
        size = conf.supybot.protocols.http.peekSize()
        def url_workaround(url):
            """Returns a new URL that should be the target of a new request,
            or None if the request is fine as it is.
            The returned URL may be the same as the parameter, in case
            something else was changed by this function through side-effects.
            """
            nonlocal size
            parsed_url = utils.web.urlparse(url)
            print(repr(parsed_url.netloc))
            if parsed_url.netloc in ('youtube.com', 'youtu.be') \
                    or parsed_url.netloc.endswith(('.youtube.com')):
                # there is a lot of Javascript before the <title>
                if size < 819200:
                    size = max(819200, size)
                    return url
                else:
                    return None
            if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
                # Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
                # <title> on every page.
                parsed_url = parsed_url._replace(netloc='old.reddit.com')
                url = utils.web.urlunparse(parsed_url)
                self.log.debug("Rewrite URL to %s", url)
                return url
            return None
        url = url_workaround(url) or url
        timeout = self.registryValue('timeout')
        headers = conf.defaultHttpHeaders(irc.network, msg.channel)
        try:
            fd = utils.web.getUrlFd(url, timeout=timeout, headers=headers)
            target = fd.geturl()
            fixed_target = url_workaround(target)
            if fixed_target is not None:
                # happens when using minification services linking to one of
                # the websites handled by url_workaround; eg. v.redd.it
                fd.close()
                fd = utils.web.getUrlFd(fixed_target, timeout=timeout, headers=headers)
                target = fd.geturl()
            text = fd.read(size)
            response_headers = fd.headers
            fd.close()
--- a/plugins/Web/test.py
+++ b/plugins/Web/test.py
@ -84,6 +84,9 @@ class WebTestCase(ChannelPluginTestCase):
            self.assertRegexp(
                'title https://www.reddit.com/r/irc/',
                'Internet Relay Chat')
            self.assertRegexp(
                'title https://v.redd.it/odhemxo6giud1',
                'Small Kitty Big Goals : MadeMeSmile')
        def testTitleMarcinfo(self):
            # Checks that we don't crash on 'Content-Type: text/html;'