fix some bad links

2025-04-28 22:41:08 -05:00 · 2022-04-21 12:53:12 -05:00 · 2022-04-21 12:53:12 -05:00 · ed3d345185
commit ed3d345185
parent b4f11088ec
1 changed file with 79 additions and 63 deletions
--- a/SpiffyTitles/plugin.py
+++ b/SpiffyTitles/plugin.py
@@ -266,7 +266,9 @@ class SpiffyTitles(callbacks.Plugin):
        """
        if self.registryValue("cacheGlobal"):
            channel = "global"
-        cached_link = self.get_link_from_cache(url, channel)
+        # cached_link = self.get_link_from_cache(url, channel)
+        log.info("Skipping cache check")
+        cached_link = None
        if cached_link:
            title = cached_link["title"]
        else:
@@ -420,6 +422,9 @@ class SpiffyTitles(callbacks.Plugin):
        Get the HTML of a website based on a URL
        """
        max_retries = self.registryValue("maxRetries")
+        size = conf.supybot.protocols.http.peekSize()
+        generic_error = self.registryValue("badLinkText", channel=channel)
+        response_size = 0
        if retries >= max_retries:
            log.debug("SpiffyTitles: hit maximum retries for %s" % url)
            return (None, False)
@@ -436,6 +441,14 @@ class SpiffyTitles(callbacks.Plugin):
                stream=True,
                proxies=self.proxies,
            ) as request:
+                for chunk in request.iter_content(chunk_size=1024):
+                    if response_size > size:
+                        request.close()
+                        return (generic_error, False)
+                    if 'content-length' in request.headers:
+                        if int(request.headers['content-length']) > size:
+                            log.debug("SpiffyTitles: URL ignored due to exceeding content size")
+                            return (generic_error, False)
                    request.raise_for_status()
                    if request.history:
                        # check the top two domain levels
@@ -458,7 +471,7 @@ class SpiffyTitles(callbacks.Plugin):
                                    " match: %s"
                                    % url
                                )
-                            return
+                                return (None, False)
                            whitelist_pattern = self.registryValue(
                                "whitelistDomainPattern", channel=channel
                            )
@@ -471,7 +484,7 @@ class SpiffyTitles(callbacks.Plugin):
                                    " mismatch: %s"
                                    % url
                                )
-                            return
+                                return (None, False)
                            text = self.get_title_by_url(request.url, channel)
                            text = text.lstrip("\x02").lstrip("^").strip()
                            return (text, is_redirect)
@@ -480,9 +493,11 @@ class SpiffyTitles(callbacks.Plugin):
                    acceptable_types = self.registryValue("default.mimeTypes")
                    log.debug("SpiffyTitles: content type %s" % (content_type))
                    if content_type in acceptable_types:
-                    text = request.content
-                    if text:
-                        return (self.get_title_from_html(text, channel), is_redirect)
+                        if chunk:
+                            title = self.get_title_from_html(chunk, channel)
+                            if not title:
+                                continue
+                            return (title, is_redirect)
                        else:
                            log.debug("SpiffyTitles: empty content from %s" % (url))
                    else:
@@ -500,12 +515,13 @@ class SpiffyTitles(callbacks.Plugin):
                            {"type": content_type, "size": size}
                        )
                        return (text, is_redirect)
+                    response_size += len(chunk)
        except requests.exceptions.MissingSchema as e:
            url_wschema = "http://%s" % (url)
            log.error("SpiffyTitles missing schema. Retrying with %s" % (url_wschema))
            info = urlparse(url_wschema)
            if self.is_ignored_domain(info.netloc, channel):
-                return
+                return (None, False)
            else:
                return self.get_source_by_url(url_wschema, channel)
        except requests.exceptions.Timeout as e: