fix some bad links

2025-04-29 06:51:12 -05:00 · 2022-04-21 12:53:12 -05:00 · 2022-04-21 12:53:12 -05:00 · ed3d345185
commit ed3d345185
parent b4f11088ec
1 changed file with 79 additions and 63 deletions
--- a/SpiffyTitles/plugin.py
+++ b/SpiffyTitles/plugin.py
@@ -266,7 +266,9 @@ class SpiffyTitles(callbacks.Plugin):
        """
        if self.registryValue("cacheGlobal"):
            channel = "global"
-        cached_link = self.get_link_from_cache(url, channel)
+        # cached_link = self.get_link_from_cache(url, channel)
        log.info("Skipping cache check")
        cached_link = None
        if cached_link:
            title = cached_link["title"]
        else:
@@ -420,6 +422,9 @@ class SpiffyTitles(callbacks.Plugin):
        Get the HTML of a website based on a URL
        """
        max_retries = self.registryValue("maxRetries")
        size = conf.supybot.protocols.http.peekSize()
        generic_error = self.registryValue("badLinkText", channel=channel)
        response_size = 0
        if retries >= max_retries:
            log.debug("SpiffyTitles: hit maximum retries for %s" % url)
            return (None, False)
@@ -436,76 +441,87 @@ class SpiffyTitles(callbacks.Plugin):
                stream=True,
                proxies=self.proxies,
            ) as request:
-                request.raise_for_status()
+                for chunk in request.iter_content(chunk_size=1024):
-                if request.history:
+                    if response_size > size:
-                    # check the top two domain levels
+                        request.close()
-                    link_domain = self.get_base_domain(request.history[0].url)
+                        return (generic_error, False)
-                    real_domain = self.get_base_domain(request.url)
+                    if 'content-length' in request.headers:
-                    if link_domain != real_domain:
+                        if int(request.headers['content-length']) > size:
-                        is_redirect = True
+                            log.debug("SpiffyTitles: URL ignored due to exceeding content size")
-                        for redir in request.history:
+                            return (generic_error, False)
-                            log.debug(
+                    request.raise_for_status()
-                                "SpiffyTitles: Redirect %s from %s"
+                    if request.history:
-                                % (redir.status_code, redir.url)
+                        # check the top two domain levels
                        link_domain = self.get_base_domain(request.history[0].url)
                        real_domain = self.get_base_domain(request.url)
                        if link_domain != real_domain:
                            is_redirect = True
                            for redir in request.history:
                                log.debug(
                                    "SpiffyTitles: Redirect %s from %s"
                                    % (redir.status_code, redir.url)
                                )
                            log.debug("SpiffyTitles: Final url %s" % (request.url))
                            info = urlparse(request.url)
                            domain = info.netloc
                            is_ignored = self.is_ignored_domain(domain, channel)
                            if is_ignored:
                                log.debug(
                                    "SpiffyTitles: URL ignored due to domain blacklist"
                                    " match: %s"
                                    % url
                                )
                                return (None, False)
                            whitelist_pattern = self.registryValue(
                                "whitelistDomainPattern", channel=channel
                            )
-                        log.debug("SpiffyTitles: Final url %s" % (request.url))
+                            is_whitelisted_domain = self.is_whitelisted_domain(
-                        info = urlparse(request.url)
+                                domain, channel
                        domain = info.netloc
                        is_ignored = self.is_ignored_domain(domain, channel)
                        if is_ignored:
                            log.debug(
                                "SpiffyTitles: URL ignored due to domain blacklist"
                                " match: %s"
                                % url
                            )
-                            return
+                            if whitelist_pattern and not is_whitelisted_domain:
-                        whitelist_pattern = self.registryValue(
+                                log.debug(
-                            "whitelistDomainPattern", channel=channel
+                                    "SpiffyTitles: URL ignored due to domain whitelist"
-                        )
+                                    " mismatch: %s"
-                        is_whitelisted_domain = self.is_whitelisted_domain(
+                                    % url
-                            domain, channel
+                                )
-                        )
+                                return (None, False)
-                        if whitelist_pattern and not is_whitelisted_domain:
+                            text = self.get_title_by_url(request.url, channel)
-                            log.debug(
+                            text = text.lstrip("\x02").lstrip("^").strip()
-                                "SpiffyTitles: URL ignored due to domain whitelist"
+                            return (text, is_redirect)
-                                " mismatch: %s"
+                    # Check the content type
-                                % url
+                    content_type = request.headers.get("content-type").split(";")[0].strip()
-                            )
+                    acceptable_types = self.registryValue("default.mimeTypes")
-                            return
+                    log.debug("SpiffyTitles: content type %s" % (content_type))
-                        text = self.get_title_by_url(request.url, channel)
+                    if content_type in acceptable_types:
-                        text = text.lstrip("\x02").lstrip("^").strip()
+                        if chunk:
-                        return (text, is_redirect)
+                            title = self.get_title_from_html(chunk, channel)
-                # Check the content type
+                            if not title:
-                content_type = request.headers.get("content-type").split(";")[0].strip()
+                                continue
-                acceptable_types = self.registryValue("default.mimeTypes")
+                            return (title, is_redirect)
-                log.debug("SpiffyTitles: content type %s" % (content_type))
+                        else:
-                if content_type in acceptable_types:
+                            log.debug("SpiffyTitles: empty content from %s" % (url))
                    text = request.content
                    if text:
                        return (self.get_title_from_html(text, channel), is_redirect)
                    else:
-                        log.debug("SpiffyTitles: empty content from %s" % (url))
+                        log.debug(
-                else:
+                            "SpiffyTitles: unacceptable mime type %s for url %s"
-                    log.debug(
+                            % (content_type, url)
-                        "SpiffyTitles: unacceptable mime type %s for url %s"
+                        )
-                        % (content_type, url)
+                        size = request.headers.get("content-length")
-                    )
+                        if size:
-                    size = request.headers.get("content-length")
+                            size = self.get_readable_file_size(int(size))
-                    if size:
+                        file_template = self.registryValue(
-                        size = self.get_readable_file_size(int(size))
+                            "default.fileTemplate", channel=channel
-                    file_template = self.registryValue(
+                        )
-                        "default.fileTemplate", channel=channel
+                        text = Template(file_template).render(
-                    )
+                            {"type": content_type, "size": size}
-                    text = Template(file_template).render(
+                        )
-                        {"type": content_type, "size": size}
+                        return (text, is_redirect)
-                    )
+                    response_size += len(chunk)
                    return (text, is_redirect)
        except requests.exceptions.MissingSchema as e:
            url_wschema = "http://%s" % (url)
            log.error("SpiffyTitles missing schema. Retrying with %s" % (url_wschema))
            info = urlparse(url_wschema)
            if self.is_ignored_domain(info.netloc, channel):
-                return
+                return (None, False)
            else:
                return self.get_source_by_url(url_wschema, channel)
        except requests.exceptions.Timeout as e: