mirror of
https://github.com/oddluck/limnoria-plugins.git
synced 2025-04-29 06:51:12 -05:00
fix some bad links
This commit is contained in:
parent
b4f11088ec
commit
ed3d345185
@ -266,7 +266,9 @@ class SpiffyTitles(callbacks.Plugin):
|
|||||||
"""
|
"""
|
||||||
if self.registryValue("cacheGlobal"):
|
if self.registryValue("cacheGlobal"):
|
||||||
channel = "global"
|
channel = "global"
|
||||||
cached_link = self.get_link_from_cache(url, channel)
|
# cached_link = self.get_link_from_cache(url, channel)
|
||||||
|
log.info("Skipping cache check")
|
||||||
|
cached_link = None
|
||||||
if cached_link:
|
if cached_link:
|
||||||
title = cached_link["title"]
|
title = cached_link["title"]
|
||||||
else:
|
else:
|
||||||
@ -420,6 +422,9 @@ class SpiffyTitles(callbacks.Plugin):
|
|||||||
Get the HTML of a website based on a URL
|
Get the HTML of a website based on a URL
|
||||||
"""
|
"""
|
||||||
max_retries = self.registryValue("maxRetries")
|
max_retries = self.registryValue("maxRetries")
|
||||||
|
size = conf.supybot.protocols.http.peekSize()
|
||||||
|
generic_error = self.registryValue("badLinkText", channel=channel)
|
||||||
|
response_size = 0
|
||||||
if retries >= max_retries:
|
if retries >= max_retries:
|
||||||
log.debug("SpiffyTitles: hit maximum retries for %s" % url)
|
log.debug("SpiffyTitles: hit maximum retries for %s" % url)
|
||||||
return (None, False)
|
return (None, False)
|
||||||
@ -436,76 +441,87 @@ class SpiffyTitles(callbacks.Plugin):
|
|||||||
stream=True,
|
stream=True,
|
||||||
proxies=self.proxies,
|
proxies=self.proxies,
|
||||||
) as request:
|
) as request:
|
||||||
request.raise_for_status()
|
for chunk in request.iter_content(chunk_size=1024):
|
||||||
if request.history:
|
if response_size > size:
|
||||||
# check the top two domain levels
|
request.close()
|
||||||
link_domain = self.get_base_domain(request.history[0].url)
|
return (generic_error, False)
|
||||||
real_domain = self.get_base_domain(request.url)
|
if 'content-length' in request.headers:
|
||||||
if link_domain != real_domain:
|
if int(request.headers['content-length']) > size:
|
||||||
is_redirect = True
|
log.debug("SpiffyTitles: URL ignored due to exceeding content size")
|
||||||
for redir in request.history:
|
return (generic_error, False)
|
||||||
log.debug(
|
request.raise_for_status()
|
||||||
"SpiffyTitles: Redirect %s from %s"
|
if request.history:
|
||||||
% (redir.status_code, redir.url)
|
# check the top two domain levels
|
||||||
|
link_domain = self.get_base_domain(request.history[0].url)
|
||||||
|
real_domain = self.get_base_domain(request.url)
|
||||||
|
if link_domain != real_domain:
|
||||||
|
is_redirect = True
|
||||||
|
for redir in request.history:
|
||||||
|
log.debug(
|
||||||
|
"SpiffyTitles: Redirect %s from %s"
|
||||||
|
% (redir.status_code, redir.url)
|
||||||
|
)
|
||||||
|
log.debug("SpiffyTitles: Final url %s" % (request.url))
|
||||||
|
info = urlparse(request.url)
|
||||||
|
domain = info.netloc
|
||||||
|
is_ignored = self.is_ignored_domain(domain, channel)
|
||||||
|
if is_ignored:
|
||||||
|
log.debug(
|
||||||
|
"SpiffyTitles: URL ignored due to domain blacklist"
|
||||||
|
" match: %s"
|
||||||
|
% url
|
||||||
|
)
|
||||||
|
return (None, False)
|
||||||
|
whitelist_pattern = self.registryValue(
|
||||||
|
"whitelistDomainPattern", channel=channel
|
||||||
)
|
)
|
||||||
log.debug("SpiffyTitles: Final url %s" % (request.url))
|
is_whitelisted_domain = self.is_whitelisted_domain(
|
||||||
info = urlparse(request.url)
|
domain, channel
|
||||||
domain = info.netloc
|
|
||||||
is_ignored = self.is_ignored_domain(domain, channel)
|
|
||||||
if is_ignored:
|
|
||||||
log.debug(
|
|
||||||
"SpiffyTitles: URL ignored due to domain blacklist"
|
|
||||||
" match: %s"
|
|
||||||
% url
|
|
||||||
)
|
)
|
||||||
return
|
if whitelist_pattern and not is_whitelisted_domain:
|
||||||
whitelist_pattern = self.registryValue(
|
log.debug(
|
||||||
"whitelistDomainPattern", channel=channel
|
"SpiffyTitles: URL ignored due to domain whitelist"
|
||||||
)
|
" mismatch: %s"
|
||||||
is_whitelisted_domain = self.is_whitelisted_domain(
|
% url
|
||||||
domain, channel
|
)
|
||||||
)
|
return (None, False)
|
||||||
if whitelist_pattern and not is_whitelisted_domain:
|
text = self.get_title_by_url(request.url, channel)
|
||||||
log.debug(
|
text = text.lstrip("\x02").lstrip("^").strip()
|
||||||
"SpiffyTitles: URL ignored due to domain whitelist"
|
return (text, is_redirect)
|
||||||
" mismatch: %s"
|
# Check the content type
|
||||||
% url
|
content_type = request.headers.get("content-type").split(";")[0].strip()
|
||||||
)
|
acceptable_types = self.registryValue("default.mimeTypes")
|
||||||
return
|
log.debug("SpiffyTitles: content type %s" % (content_type))
|
||||||
text = self.get_title_by_url(request.url, channel)
|
if content_type in acceptable_types:
|
||||||
text = text.lstrip("\x02").lstrip("^").strip()
|
if chunk:
|
||||||
return (text, is_redirect)
|
title = self.get_title_from_html(chunk, channel)
|
||||||
# Check the content type
|
if not title:
|
||||||
content_type = request.headers.get("content-type").split(";")[0].strip()
|
continue
|
||||||
acceptable_types = self.registryValue("default.mimeTypes")
|
return (title, is_redirect)
|
||||||
log.debug("SpiffyTitles: content type %s" % (content_type))
|
else:
|
||||||
if content_type in acceptable_types:
|
log.debug("SpiffyTitles: empty content from %s" % (url))
|
||||||
text = request.content
|
|
||||||
if text:
|
|
||||||
return (self.get_title_from_html(text, channel), is_redirect)
|
|
||||||
else:
|
else:
|
||||||
log.debug("SpiffyTitles: empty content from %s" % (url))
|
log.debug(
|
||||||
else:
|
"SpiffyTitles: unacceptable mime type %s for url %s"
|
||||||
log.debug(
|
% (content_type, url)
|
||||||
"SpiffyTitles: unacceptable mime type %s for url %s"
|
)
|
||||||
% (content_type, url)
|
size = request.headers.get("content-length")
|
||||||
)
|
if size:
|
||||||
size = request.headers.get("content-length")
|
size = self.get_readable_file_size(int(size))
|
||||||
if size:
|
file_template = self.registryValue(
|
||||||
size = self.get_readable_file_size(int(size))
|
"default.fileTemplate", channel=channel
|
||||||
file_template = self.registryValue(
|
)
|
||||||
"default.fileTemplate", channel=channel
|
text = Template(file_template).render(
|
||||||
)
|
{"type": content_type, "size": size}
|
||||||
text = Template(file_template).render(
|
)
|
||||||
{"type": content_type, "size": size}
|
return (text, is_redirect)
|
||||||
)
|
response_size += len(chunk)
|
||||||
return (text, is_redirect)
|
|
||||||
except requests.exceptions.MissingSchema as e:
|
except requests.exceptions.MissingSchema as e:
|
||||||
url_wschema = "http://%s" % (url)
|
url_wschema = "http://%s" % (url)
|
||||||
log.error("SpiffyTitles missing schema. Retrying with %s" % (url_wschema))
|
log.error("SpiffyTitles missing schema. Retrying with %s" % (url_wschema))
|
||||||
info = urlparse(url_wschema)
|
info = urlparse(url_wschema)
|
||||||
if self.is_ignored_domain(info.netloc, channel):
|
if self.is_ignored_domain(info.netloc, channel):
|
||||||
return
|
return (None, False)
|
||||||
else:
|
else:
|
||||||
return self.get_source_by_url(url_wschema, channel)
|
return self.get_source_by_url(url_wschema, channel)
|
||||||
except requests.exceptions.Timeout as e:
|
except requests.exceptions.Timeout as e:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user