Wikifetch: rewrite plugin using MediaWiki API

James Lu 2023-02-12 16:07:47 -08:00
parent 31c97a646b
commit 582a02f74e
5 changed files with 102 additions and 307 deletions

View File

@@ -77,5 +77,5 @@ Most of these plugins also have their own READMEs in their folders; you can usually
 - Translates text through Google Translate multiple times in order to get amusing results.
 ### Wikifetch
-- Fork of [ProgVal's Wikipedia plugin](https://github.com/ProgVal/Supybot-plugins), with support for other wikis (via a `--site` option) and other improvements.
-- **Requires:** [lxml](https://lxml.de/installation.html)
+- Fetch content from MediaWiki-powered sites (Wikipedia, Fandom)
+- **Requires:** [Beautiful Soup 4](http://www.crummy.com/software/BeautifulSoup/bs4/doc/), [mwparserfromhell](https://mwparserfromhell.readthedocs.io/)
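For context on the changes below: instead of scraping article HTML with lxml, the rewritten plugin asks MediaWiki's `action=parse` API for a page's wikitext and head HTML. A minimal standalone sketch of that request, using the same query parameters as the new `_mediawiki_fetch` method (the helper name, User-agent string, and error handling here are illustrative assumptions, not part of the plugin):

    import json
    import urllib.parse
    import urllib.request

    def fetch_parse_data(api_url, title):
        # Same parameters the plugin sends: parse the page server-side and
        # return its wikitext plus the rendered <head> HTML.
        params = urllib.parse.urlencode({
            'action': 'parse',
            'page': title,
            'prop': 'wikitext|headhtml',
            'formatversion': 2,
            'format': 'json',
            'redirects': True,
        })
        req = urllib.request.Request(f"{api_url}?{params}",
                                     headers={'User-agent': 'wikifetch-example/0.1'})  # placeholder UA
        with urllib.request.urlopen(req) as fd:
            data = json.load(fd)
        if 'error' in data:
            raise RuntimeError(f"{data['error']['code']}: {data['error']['info']}")
        return data['parse']['title'], data['parse']['wikitext']

    # e.g. fetch_parse_data('https://en.wikipedia.org/w/api.php', 'Vancouver')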

View File

@@ -1,7 +1,5 @@
 ###
-# Copyright (c) 2010, quantumlemur
-# Copyright (c) 2011, Valentin Lorentz
-# Copyright (c) 2015, James Lu
+# Copyright (c) 2023 James Lu
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -30,11 +28,9 @@
 ###

-import supybot.conf as conf
-import supybot.registry as registry
+from supybot import conf, registry

 try:
-    from supybot.i18n import PluginInternationalization
-    from supybot.i18n import internationalizeDocstring
+    from supybot.i18n import PluginInternationalization, internationalizeDocstring
     _ = PluginInternationalization('Wikifetch')
 except:
     # This are useless functions that's allow to run the plugin on a bot
@@ -50,10 +46,8 @@ def configure(advanced):
     from supybot.questions import expect, anything, something, yn
     conf.registerPlugin('Wikifetch', True)

 Wikifetch = conf.registerPlugin('Wikifetch')
-conf.registerChannelValue(Wikifetch, 'url',
-    registry.String(_('en.wikipedia.org'), _("""Default URL of the
-    website to pull from.""")))
+conf.registerGroup(Wikifetch, 'wikipedia')
+conf.registerChannelValue(Wikifetch.wikipedia, 'lang',
+    registry.String('en', _("""Default Wikipedia language""")))

 # vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
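The old per-channel `url` setting is gone; its replacement is a channel-scoped Wikipedia language under the new `wikipedia` group. Assuming a standard Limnoria setup, the value can be inspected or overridden at runtime through the Config plugin, for example (the channel name is made up):

    config plugins.Wikifetch.wikipedia.lang
    config channel #mychannel plugins.Wikifetch.wikipedia.lang de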

View File

@@ -1,7 +1,7 @@
 ###
 # Copyright (c) 2010, quantumlemur
 # Copyright (c) 2011, Valentin Lorentz
-# Copyright (c) 2015,2017 James Lu <james@overdrivenetworks.com>
+# Copyright (c) 2015-2023 James Lu
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -29,17 +29,14 @@
 # POSSIBILITY OF SUCH DAMAGE.
 ###

+import json
 import re
-import sys
-import lxml.html
-import supybot.utils as utils
+import urllib.parse
+from supybot import callbacks, conf, ircutils, plugins, utils
 from supybot.commands import wrap, getopts, additional
-import supybot.plugins as plugins
-import supybot.ircutils as ircutils
-import supybot.callbacks as callbacks
 try:
-    from supybot.i18n import PluginInternationalization
-    from supybot.i18n import internationalizeDocstring
+    from supybot.i18n import PluginInternationalization, internationalizeDocstring
     _ = PluginInternationalization('Wikifetch')
 except:
     # This are useless functions that's allow to run the plugin on a bot
@@ -47,238 +44,92 @@ except:
     _ = lambda x:x
     internationalizeDocstring = lambda x:x

-if sys.version_info[0] < 3:
-    raise ImportError('This plugin requires Python 3. For a legacy version of this plugin that still '
-                      'supports Python 2, consult the python2-legacy branch at '
-                      'https://github.com/jlu5/SupyPlugins/tree/python2-legacy')
-from urllib.parse import quote_plus
+from bs4 import BeautifulSoup
+import mwparserfromhell
+
+HEADERS = {
+    'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version
+}

 class Wikifetch(callbacks.Plugin):
     """Grabs data from Wikipedia and other MediaWiki-powered sites."""
     threaded = True

-    # This defines a series of suffixes this should be added after the domain name.
-    SPECIAL_URLS = {'wikia.com': '/wiki',
-                    'wikipedia.org': '/wiki',
-                    'wiki.archlinux.org': '/index.php',
-                    'wiki.gentoo.org': '/wiki',
-                    'mediawiki.org': '/wiki',
-                    'wikimedia.org': '/wiki',
-                    }
-
-    def _get_article_tree(self, baseurl, query, use_mw_parsing=True):
-        """
-        Returns a wiki article tree given the base URL and search query. baseurl can be None,
-        in which case, searching is skipped and the search query will be treated as a raw address.
-        """
-        if baseurl is None:
-            addr = query
-        else:
-            # Different instances of MediaWiki use different URLs... This tries
-            # to make the parser work for most sites, but still use resonable defaults
-            # such as filling in http:// and appending /wiki to links...
-            baseurl = baseurl.lower()
-            for match, suffix in self.SPECIAL_URLS.items():
-                if match in baseurl:
-                    baseurl += suffix
-                    break
-
-            # Add http:// to the URL if a scheme isn't specified
-            if not baseurl.startswith(('http://', 'https://')):
-                baseurl = 'http://' + baseurl
-
-            if use_mw_parsing:
-                # first, we get the page
-                addr = '%s/Special:Search?search=%s' % \
-                    (baseurl, quote_plus(query))
-            else:
-                addr = '%s/%s' % (baseurl, query)
-
-        self.log.debug('Wikifetch: using URL %s', addr)
-
-        try:
-            article = utils.web.getUrl(addr, timeout=3)
-        except utils.web.Error:
-            self.log.exception('Failed to fetch link %s', addr)
-            raise
-
-        article = article.decode()
-
-        tree = lxml.html.document_fromstring(article)
-        return (tree, article, addr)
-
-    def _wiki(self, irc, msg, search, baseurl, use_mw_parsing=True):
-        """Fetches and replies content from a MediaWiki-powered website."""
-        reply = ''
-
-        # First, fetch and parse the page
-        tree, article, addr = self._get_article_tree(baseurl, search, use_mw_parsing=use_mw_parsing)
-
-        # check if it gives a "Did you mean..." redirect
-        didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'
-                                '[@title="Special:Search"]')
-        if didyoumean:
-            redirect = didyoumean[0].text_content().strip()
-            if self.registryValue('showRedirects', msg.args[0]):
-                reply += _('I didn\'t find anything for "%s". '
-                           'Did you mean "%s"? ') % (search, redirect)
-            tree, article, addr = self._get_article_tree(baseurl, didyoumean[0].get('href'))
-            search = redirect
-
-        # check if it's a page of search results (rather than an article), and
-        # if so, retrieve the first result
-        searchresults = tree.xpath('//div[@class="searchresults"]/ul/li/a') or \
-                        tree.xpath('//article/ul/li/a') # Special case for Wikia (2017-01-27)
-        self.log.debug('Wikifetch: got search results %s', searchresults)
-
-        if searchresults:
-            redirect = searchresults[0].text_content().strip()
-            if self.registryValue('showRedirects', msg.args[0]):
-                reply += _('I didn\'t find anything for "%s", but here\'s the '
-                           'result for "%s": ') % (search, redirect)
-            # Follow the search result and fetch that article. Note: use the original
-            # base url to prevent prefixes like "/wiki" from being added twice.
-            self.log.debug('Wikifetch: following search result:')
-            tree, article, addr = self._get_article_tree(None, searchresults[0].get('href'))
-            search = redirect
-
-        # extract the address we got it from - most sites have the perm link
-        # inside the page itself
-        try:
-            addr = tree.find(".//link[@rel='canonical']").attrib['href']
-        except (ValueError, AttributeError):
-            self.log.debug('Wikifetch: failed <link rel="canonical"> link extraction, skipping')
-            try:
-                addr = tree.find(".//div[@class='printfooter']/a").attrib['href']
-                addr = re.sub(r'([&?]|(amp;)?)oldid=\d+$', '', addr)
-            except (ValueError, AttributeError):
-                self.log.debug('Wikifetch: failed printfooter link extraction, skipping')
-                # If any of the above post-processing tricks fail, just ignore
-                pass
-
-        text_content = tree
-        if use_mw_parsing:
-            text_content = tree.xpath("//div[@class='mw-parser-output']") or tree.xpath("//div[@id='mw-content-text']")
-            if text_content:
-                text_content = text_content[0]
-        self.log.debug('Wikifetch: Using %s as text_content', text_content)
-
-        # check if it's a disambiguation page
-        disambig = tree.xpath('//table[@id="disambigbox"]') or \
-                   tree.xpath('//table[@id="setindexbox"]') or \
-                   tree.xpath('//div[contains(@class, "disambig")]') # Wikia (2017-01-27)
-        if disambig:
-            reply += format(_('%u is a disambiguation page. '), addr)
-
-            disambig = text_content.xpath('./ul/li')
-            disambig_results = []
-            for item in disambig:
-                for link in item.findall('a'):
-                    if link.text is not None:
-                        # Hackishly bold all <a> tags
-                        link.text = "&#x02;%s&#x02;" % link.text
-                item = item.text_content().replace('&#x02;', '\x02')
-                # Normalize and strip whitespace, to prevent newlines and such
-                # from corrupting the display.
-                item = utils.str.normalizeWhitespace(item).strip()
-                disambig_results.append(item)
-            if disambig_results:
-                reply += format(_('Possible results include: %s'), '; '.join(disambig_results))
-
-        # Catch talk pages
-        elif 'ns-talk' in tree.find("body").attrib.get('class', ''):
-            reply += format(_('This article appears to be a talk page: %u'), addr)
-        else:
-            # Get the first paragraph as text.
-            paragraphs = []
-            for p in text_content.xpath("./p"):
-                self.log.debug('Wikifetch: looking at paragraph %s', p.text_content())
-
-                # Skip geographic coordinates, e.g. on articles for countries
-                if p.xpath(".//span[@class='geo-dec']"):
-                    continue
-                # 2018-07-19: some articles have an empty p tag with this class and no content (why?)
-                elif 'mw-empty-elt' in p.attrib.get('class', ''):
-                    continue
-                # Skip <p> tags with no content, for obvious reasons
-                elif not p.text_content().strip():
-                    continue
-
-                paragraphs.append(p)
-
-            if (not paragraphs) or 'wiki/Special:Search' in addr:
-                if 'wikipedia:wikiproject' in addr.lower():
-                    reply += format(_('This page appears to be a WikiProject page, '
-                                      'but it is too complex for us to parse: %u'), addr)
-                else:
-                    irc.error(_('Not found, or page malformed.'), Raise=True)
-            else:
-                p = paragraphs[0]
-
-                # Replace <b> tags with IRC-style bold, this has to be
-                # done indirectly because unescaped '\x02' is invalid in XML
-                for b_tag in p.xpath('//b'):
-                    b_tag.text = "&#x02;%s&#x02;" % (b_tag.text or '')
-
-                p = p.text_content()
-                p = p.replace('&#x02;', '\x02')
-
-                # Get rid of newlines, etc., that can corrupt the output.
-                p = utils.str.normalizeWhitespace(p)
-                p = p.strip()
-
-                if not p:
-                    reply = _('<Page was too complex to parse>')
-
-                reply += format('%s %s %u', p, _('Retrieved from'), addr)
-
-        reply = reply.replace('&amp;','&')
-
-        # Remove inline citations (text[1][2][3]) as well as inline notes (text[note 1]).
-        reply = re.sub(r'\[[a-z ]*?\d+\]', '', reply)
-
-        return reply
+    def _mediawiki_fetch(self, baseurl, title):
+        params = urllib.parse.urlencode({
+            'action': 'parse',
+            'page': title,
+            'prop': 'wikitext|headhtml',
+            'formatversion': 2,
+            'format': 'json',
+            'redirects': True
+        })
+        url = f"{baseurl}?{params}"
+
+        self.log.debug('Wikifetch: fetching link %s', url)
+        with utils.web.getUrlFd(url, headers=HEADERS) as fd:
+            api_data = json.load(fd)
+
+        if error := api_data.get('error'):
+            error_code = error['code']
+            error_info = error['info']
+            raise callbacks.Error(f"MediaWiki API Error: {error_code} - {error_info} - {url}")
+
+        page_title = api_data['parse']['title']
+        content = api_data['parse']['wikitext']
+        html_head = api_data['parse']['headhtml']
+
+        mw = mwparserfromhell.parse(content)
+        for line in mw.strip_code().splitlines():
+            # Ignore stray image references that strip_code leaves behind
+            if re.search(r'\|?thumb\|', line):
+                continue
+            elif len(line) < 10:
+                continue
+            text = utils.str.normalizeWhitespace(line)
+            break
+        else:
+            raise callbacks.Error(f"No text paragraph found for page {page_title!r}")
+
+        soup = BeautifulSoup(html_head, features="lxml")
+        url = ''
+        if canonical_link := soup.find('link', rel='canonical'):
+            # Wikipedia
+            url = canonical_link.attrs['href']
+        elif og_url := soup.find('meta', property='og:url'):
+            # Fandom
+            url = og_url.attrs['content']
+
+        return (text, url)
+
+    def _wiki(self, irc, baseurl, title):
+        text, url = self._mediawiki_fetch(baseurl, title)
+        if url:
+            irc.reply(utils.str.format("%s - %u", text, url))
+        else:
+            irc.reply(text)

     @internationalizeDocstring
-    @wrap([getopts({'site': 'somethingWithoutSpaces',
-                    'no-mw-parsing': ''}),
-           'text'])
-    def wiki(self, irc, msg, args, optlist, search):
-        """[--site <site>] [--no-mw-parsing] <search term>
-
-        Returns the first paragraph of a wiki article. Optionally, a --site
-        argument can be given to override the default (usually Wikipedia) -
-        try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'.
-
-        If the --no-mw-parsing option is given, MediaWiki-specific parsing is
-        disabled. This has the following effects:
-        1) No attempt at searching for a relevant Wiki page is made, and
-           an article with the same name as the search term is directly
-           retrieved.
-        2) The plugin will retrieve the first <p> tag found on a page,
-           regardless of where it's found, and print it as text. This may
-           not work on all sites, as some use <p> for navbars and headings
-           as well.
+    @wrap([getopts({'lang': 'somethingWithoutSpaces'}), 'text'])
+    def wiki(self, irc, msg, args, optlist, title):
+        """<page title>
+
+        Returns the first paragraph of a Wikipedia article.
         """
         optlist = dict(optlist)
-        baseurl = optlist.get('site') or self.registryValue('url', msg.args[0])
-        text = self._wiki(irc, msg, search, baseurl,
-                          use_mw_parsing=not optlist.get('no-mw-parsing'),
-                          )
-        irc.reply(text)
-
-    @internationalizeDocstring
-    @wrap([additional('somethingWithoutSpaces')])
-    def random(self, irc, msg, args, site):
-        """[<site>]
-
-        Returns the first paragraph of a random wiki article. Optionally, the 'site'
-        argument can be given to override the default (usually Wikipedia)."""
-        baseurl = site or self.registryValue('url', msg.args[0])
-        text = self._wiki(irc, msg, 'Special:Random', baseurl)
-        irc.reply(text)
+        lang = optlist.get('lang') or \
+            self.registryValue('wikipedia.lang', channel=msg.channel, network=irc.network)
+
+        baseurl = f'https://{lang}.wikipedia.org/w/api.php'
+        self._wiki(irc, baseurl, title)
+
+    @wrap(['somethingWithoutSpaces', 'text'])
+    def fandom(self, irc, msg, args, wiki_subdomain, title):
+        """<wiki subdomain> <title>
+
+        Returns the first paragraph of a Fandom article.
+        """
+        baseurl = f'https://{wiki_subdomain}.fandom.com/api.php'
+        self._wiki(irc, baseurl, title)

 Class = Wikifetch

 # vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
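The heart of the new plugin code is the wikitext handling: `mwparserfromhell.strip_code()` flattens the raw markup, and a small filter then picks the first line that looks like body text. A standalone sketch of just that step, runnable without a bot (the sample wikitext is made up, and the whitespace collapsing stands in for Supybot's `utils.str.normalizeWhitespace`):

    import re
    import mwparserfromhell

    def first_paragraph(wikitext):
        """Return the first body-text line from raw wikitext, as the plugin does."""
        stripped = mwparserfromhell.parse(wikitext).strip_code()
        for line in stripped.splitlines():
            # Skip leftover image/thumbnail fragments that strip_code() leaves behind
            if re.search(r'\|?thumb\|', line):
                continue
            # Skip very short lines (blank lines and stray markup remnants)
            if len(line) < 10:
                continue
            return ' '.join(line.split())
        raise ValueError("No text paragraph found")

    # Made-up sample: an infobox template followed by a lead sentence.
    sample = "{{Infobox settlement|name=Example}}\n'''Example''' is a placeholder article used for testing."
    print(first_paragraph(sample))  # Example is a placeholder article used for testing.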

View File

@@ -3,4 +3,8 @@ from supybot.setup import plugin_setup

 plugin_setup(
     'Wikifetch',
+    install_requires=[
+        'bs4',
+        'mwparserfromhell',
+    ],
 )

View File

@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 ###
-# Copyright (c) 2010, quantumlemur
-# Copyright (c) 2011, Valentin Lorentz
-# Copyright (c) 2015,2017 James Lu <james@overdrivenetworks.com>
+# Copyright (c) 2023 James Lu
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,79 +35,27 @@ if network:
         plugins = ('Wikifetch',)

         def testWikipedia(self):
-            self.assertRegexp('wiki Monty Python',
-                              r'\x02Monty Python\x02.*?\.')
-            self.assertRegexp('wiki roegdfjpoepo',
-                              'Not found, or page malformed.*')
-
-        def testStripInlineCitations(self):
-            self.assertNotRegexp('wiki UNICEF', '\[\d+\]')
-
-        def testIgnoreCoordinates(self):
-            # Articles for countries, cities, landmarks, etc. have GPS coordinates added to the top right.
-            # These should be ignored because we want to focus on the actual article text.
-            self.assertNotRegexp('wiki Canada', 'Coordinates\:')
-            self.assertNotRegexp('wiki Eiffel Tower', 'Coordinates\:')
-            self.assertNotRegexp('wiki Poland', 'Coordinates\:')
-
-        def testDisambig(self):
-            self.assertRegexp('wiki Python', 'is a disambiguation page.*'
-                              'Possible results include:.*?Pythonidae.*?;.*?;')
-            self.assertRegexp('wiki Fire (disambiguation)', '.*Possible results include:.*')
-
-        def testDisambigStripSpaces(self):
-            self.assertNotRegexp('wiki Na', '\n')
-
-        def testArticlesWithSymbolsInName(self):
-            self.assertNotError('wiki /')
-            self.assertNotError('wiki *')
-            self.assertNotError('wiki GNU/Linux')
-            self.assertNotError('wiki --site en.wikipedia.org /')
-
-        def testFollowRedirects(self):
-            self.assertRegexp('wiki YVR', 'Vancouver International Airport')
-
-        def testWikiBold(self):
-            self.assertRegexp('wiki Apple', '\x02')
-            # Complex combination of the <a> tag inside a <b> tag; we should always use
-            # empty bold content instead of the text "None".
-            self.assertNotRegexp('wiki Fallstreak hole', 'None')
-
-        def testWikiRandom(self):
-            self.assertNotError('random')
-
-        def testSiteCombinations(self):
-            self.assertNotError('wiki --site en.wikipedia.org Bread')
-            self.assertNotError('wiki --site https://en.wikipedia.org Bread')
-
-        def testNonEnglishWikipedia(self):
-            self.assertNotError('wiki --site fr.wikipedia.org Paris')
-            self.assertNotError('wiki --site de.wikipedia.org Berlin')
-            self.assertNotError('wiki --site zh.wikipedia.org 中文')
-            self.assertNotError('wiki --site ar.wikipedia.org 2017')
-
-    class Fandom(PluginTestCase):
-        plugins = ('Wikifetch',)
-
-        def testFandom(self):
-            self.assertNotError('wiki --site help.fandom.com Formatting')
-
-    class ArchLinuxWiki(PluginTestCase):
-        plugins = ('Wikifetch',)
-
-        def testArchWiki(self):
-            self.assertNotError('wiki --site wiki.archlinux.org Bash')
-
-    class GentooWiki(PluginTestCase):
-        plugins = ('Wikifetch',)
-
-        def testGentooWiki(self):
-            self.assertNotError('wiki --site wiki.gentoo.org OpenRC')
-
-    class WikimediaSites(PluginTestCase):
-        plugins = ('Wikifetch',)
-
-        def testMediaWiki(self):
-            self.assertNotError('wiki --site mediawiki.org Sites using MediaWiki')
+            self.assertRegexp('wiki Vancouver',
+                              r'^Vancouver.*Canada')
+            self.assertRegexp('wiki Python (programming language)',
+                              'Python')
+            # Follow redirects
+            self.assertRegexp('wiki CYVR',
+                              'Vancouver International Airport')
+            # Display MW errors
+            self.assertRegexp('wiki NotFoundTest',
+                              "missingtitle - The page you specified doesn't exist")
+
+        def testWikipediaLang(self):
+            self.assertRegexp('wiki --lang fr Paris', 'Paris.*capitale')
+
+            with conf.supybot.plugins.Wikifetch.Wikipedia.lang.context('zh'):
+                self.assertRegexp('wiki 地球', '地球.*太阳系')
+
+        def testFandom(self):
+            self.assertRegexp('fandom minecraft Ender Dragon',
+                              r'[Ee]nder [Dd]ragon.*boss')
+            self.assertRegexp('fandom terraria Ocean', r'Ocean.*biome')

 # vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
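Note that these tests exercise live APIs (they sit behind Limnoria's `if network:` guard), so running them with supybot-test requires internet access, and the exact regexps may need updating if the referenced articles change upstream.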