use oEmbed to check for title before parsing the page

This commit is contained in:
lod 2024-11-20 17:10:18 +01:00 committed by lodriguez
parent e57f7ebc2a
commit ecd42ad004

View File

@ -33,6 +33,7 @@ import re
import sys import sys
import string import string
import socket import socket
import json
import supybot.conf as conf import supybot.conf as conf
import supybot.utils as utils import supybot.utils as utils
@ -144,6 +145,54 @@ class Web(callbacks.PluginRegexp):
regexps = ['titleSnarfer'] regexps = ['titleSnarfer']
threaded = True threaded = True
def __init__(self, irc):
    """Set up the plugin and fetch the oEmbed provider list.

    The parent initialiser is run first; the provider list returned by
    ``_loadOEmbedProviders`` is then stored on ``self.oembed_providers``.
    """
    parent = super(Web, self)
    # Keep the bound super object on the instance, as the plugin does
    # for later delegation to the parent class.
    self.__parent = parent
    parent.__init__(irc)
    providers = self._loadOEmbedProviders()
    self.oembed_providers = providers
def _loadOEmbedProviders(self):
    """Download and decode the oEmbed provider registry.

    Returns the decoded JSON document from oembed.com. This is a
    best-effort lookup: if the download or the JSON decoding fails for
    any reason, the error is logged at debug level and an empty list is
    returned instead.
    """
    registry_url = "https://oembed.com/providers.json"
    try:
        return json.loads(utils.web.getUrl(registry_url))
    except Exception as e:
        self.log.debug(f"Failed to load oEmbed providers: {e}")
    return []
def _getOEmbedEndpoint(self, url):
    """Return the oEmbed endpoint whose URL scheme matches ``url``.

    Walks ``self.oembed_providers`` (entries shaped like those in
    providers.json: each provider has an ``endpoints`` list, and each
    endpoint has ``schemes`` and ``url``) and returns the ``url`` of the
    first endpoint with a scheme that matches the *whole* of ``url``.
    ``*`` in a scheme is a wildcard. Returns ``None`` when nothing
    matches.
    """
    for provider in self.oembed_providers:
        for entry in provider.get('endpoints', []):
            endpoint = entry.get('url', '')
            if not endpoint:
                # Without an endpoint URL there is nothing to query; keep
                # searching instead of reporting an empty match.
                continue
            for scheme in entry.get('schemes', []):
                # Escape the scheme literally, then turn each escaped
                # '*' wildcard into the regex '.*'.
                regex = re.escape(scheme).replace(r'\*', '.*')
                # fullmatch: a scheme without a trailing wildcard must
                # not match URLs that merely start with it.
                if re.fullmatch(regex, url):
                    return endpoint
    return None
def getOEmbedTitle(self, url):
    """Return the title reported by the oEmbed provider for ``url``.

    Looks up the endpoint with ``_getOEmbedEndpoint``, requests the JSON
    oEmbed response for ``url`` and returns its ``title`` field.

    Returns ``None`` when no provider matches, when the response has no
    title, or when anything goes wrong (the failure is logged at debug
    level), so that callers can fall back to another way of getting the
    title.
    """
    # Imported locally so this method does not depend on a module-level
    # import being present.
    from urllib.parse import urlencode

    try:
        oembed_endpoint = self._getOEmbedEndpoint(url)
        if not oembed_endpoint:
            return None
        # Registry endpoints may contain a literal "{format}" placeholder
        # for the response format; we always ask for JSON.
        oembed_endpoint = oembed_endpoint.replace('{format}', 'json')
        # The target URL must be percent-encoded, otherwise its own
        # '&', '?' or '#' characters would be read as part of the
        # oEmbed request rather than as the 'url' argument.
        query = urlencode({'format': 'json', 'url': url})
        separator = '&' if '?' in oembed_endpoint else '?'
        oembed_url = f"{oembed_endpoint}{separator}{query}"
        response = utils.web.getUrl(oembed_url)
        oembed_data = json.loads(response)
        return oembed_data.get('title')
    except Exception as e:
        # Best-effort: any failure simply means "no oEmbed title".
        self.log.debug(f"Failed to retrieve oEmbed title: {e}")
        return None
def noIgnore(self, irc, msg): def noIgnore(self, irc, msg):
return not self.registryValue('checkIgnored', msg.channel, irc.network) return not self.registryValue('checkIgnored', msg.channel, irc.network)
@ -280,10 +329,13 @@ class Web(callbacks.PluginRegexp):
if r and r.search(url): if r and r.search(url):
self.log.debug('Not titleSnarfing %q.', url) self.log.debug('Not titleSnarfing %q.', url)
return return
r = self.getTitle(irc, url, False, msg) title = self.getOEmbedTitle(url)
if not r: target = url
return if not title:
(target, title) = r r = self.getTitle(irc, url, False, msg)
if not r:
return
(target, title) = r
if title: if title:
domain = utils.web.getDomain(target domain = utils.web.getDomain(target
if self.registryValue('snarferShowTargetDomain', if self.registryValue('snarferShowTargetDomain',