fix some bad links

This commit is contained in:
cottongin 2022-04-21 12:53:12 -05:00
parent b4f11088ec
commit ed3d345185
No known key found for this signature in database
GPG Key ID: C0D8954A47DCB4ED

View File

@ -266,7 +266,9 @@ class SpiffyTitles(callbacks.Plugin):
""" """
if self.registryValue("cacheGlobal"): if self.registryValue("cacheGlobal"):
channel = "global" channel = "global"
cached_link = self.get_link_from_cache(url, channel) # cached_link = self.get_link_from_cache(url, channel)
log.info("Skipping cache check")
cached_link = None
if cached_link: if cached_link:
title = cached_link["title"] title = cached_link["title"]
else: else:
@ -420,6 +422,9 @@ class SpiffyTitles(callbacks.Plugin):
Get the HTML of a website based on a URL Get the HTML of a website based on a URL
""" """
max_retries = self.registryValue("maxRetries") max_retries = self.registryValue("maxRetries")
size = conf.supybot.protocols.http.peekSize()
generic_error = self.registryValue("badLinkText", channel=channel)
response_size = 0
if retries >= max_retries: if retries >= max_retries:
log.debug("SpiffyTitles: hit maximum retries for %s" % url) log.debug("SpiffyTitles: hit maximum retries for %s" % url)
return (None, False) return (None, False)
@ -436,76 +441,87 @@ class SpiffyTitles(callbacks.Plugin):
stream=True, stream=True,
proxies=self.proxies, proxies=self.proxies,
) as request: ) as request:
request.raise_for_status() for chunk in request.iter_content(chunk_size=1024):
if request.history: if response_size > size:
# check the top two domain levels request.close()
link_domain = self.get_base_domain(request.history[0].url) return (generic_error, False)
real_domain = self.get_base_domain(request.url) if 'content-length' in request.headers:
if link_domain != real_domain: if int(request.headers['content-length']) > size:
is_redirect = True log.debug("SpiffyTitles: URL ignored due to exceeding content size")
for redir in request.history: return (generic_error, False)
log.debug( request.raise_for_status()
"SpiffyTitles: Redirect %s from %s" if request.history:
% (redir.status_code, redir.url) # check the top two domain levels
link_domain = self.get_base_domain(request.history[0].url)
real_domain = self.get_base_domain(request.url)
if link_domain != real_domain:
is_redirect = True
for redir in request.history:
log.debug(
"SpiffyTitles: Redirect %s from %s"
% (redir.status_code, redir.url)
)
log.debug("SpiffyTitles: Final url %s" % (request.url))
info = urlparse(request.url)
domain = info.netloc
is_ignored = self.is_ignored_domain(domain, channel)
if is_ignored:
log.debug(
"SpiffyTitles: URL ignored due to domain blacklist"
" match: %s"
% url
)
return (None, False)
whitelist_pattern = self.registryValue(
"whitelistDomainPattern", channel=channel
) )
log.debug("SpiffyTitles: Final url %s" % (request.url)) is_whitelisted_domain = self.is_whitelisted_domain(
info = urlparse(request.url) domain, channel
domain = info.netloc
is_ignored = self.is_ignored_domain(domain, channel)
if is_ignored:
log.debug(
"SpiffyTitles: URL ignored due to domain blacklist"
" match: %s"
% url
) )
return if whitelist_pattern and not is_whitelisted_domain:
whitelist_pattern = self.registryValue( log.debug(
"whitelistDomainPattern", channel=channel "SpiffyTitles: URL ignored due to domain whitelist"
) " mismatch: %s"
is_whitelisted_domain = self.is_whitelisted_domain( % url
domain, channel )
) return (None, False)
if whitelist_pattern and not is_whitelisted_domain: text = self.get_title_by_url(request.url, channel)
log.debug( text = text.lstrip("\x02").lstrip("^").strip()
"SpiffyTitles: URL ignored due to domain whitelist" return (text, is_redirect)
" mismatch: %s" # Check the content type
% url content_type = request.headers.get("content-type").split(";")[0].strip()
) acceptable_types = self.registryValue("default.mimeTypes")
return log.debug("SpiffyTitles: content type %s" % (content_type))
text = self.get_title_by_url(request.url, channel) if content_type in acceptable_types:
text = text.lstrip("\x02").lstrip("^").strip() if chunk:
return (text, is_redirect) title = self.get_title_from_html(chunk, channel)
# Check the content type if not title:
content_type = request.headers.get("content-type").split(";")[0].strip() continue
acceptable_types = self.registryValue("default.mimeTypes") return (title, is_redirect)
log.debug("SpiffyTitles: content type %s" % (content_type)) else:
if content_type in acceptable_types: log.debug("SpiffyTitles: empty content from %s" % (url))
text = request.content
if text:
return (self.get_title_from_html(text, channel), is_redirect)
else: else:
log.debug("SpiffyTitles: empty content from %s" % (url)) log.debug(
else: "SpiffyTitles: unacceptable mime type %s for url %s"
log.debug( % (content_type, url)
"SpiffyTitles: unacceptable mime type %s for url %s" )
% (content_type, url) size = request.headers.get("content-length")
) if size:
size = request.headers.get("content-length") size = self.get_readable_file_size(int(size))
if size: file_template = self.registryValue(
size = self.get_readable_file_size(int(size)) "default.fileTemplate", channel=channel
file_template = self.registryValue( )
"default.fileTemplate", channel=channel text = Template(file_template).render(
) {"type": content_type, "size": size}
text = Template(file_template).render( )
{"type": content_type, "size": size} return (text, is_redirect)
) response_size += len(chunk)
return (text, is_redirect)
except requests.exceptions.MissingSchema as e: except requests.exceptions.MissingSchema as e:
url_wschema = "http://%s" % (url) url_wschema = "http://%s" % (url)
log.error("SpiffyTitles missing schema. Retrying with %s" % (url_wschema)) log.error("SpiffyTitles missing schema. Retrying with %s" % (url_wschema))
info = urlparse(url_wschema) info = urlparse(url_wschema)
if self.is_ignored_domain(info.netloc, channel): if self.is_ignored_domain(info.netloc, channel):
return return (None, False)
else: else:
return self.get_source_by_url(url_wschema, channel) return self.get_source_by_url(url_wschema, channel)
except requests.exceptions.Timeout as e: except requests.exceptions.Timeout as e: