Wikifetch: rewrite plugin using MediaWiki API

This commit is contained in:
James Lu 2023-02-12 16:07:47 -08:00
parent 31c97a646b
commit 582a02f74e
5 changed files with 102 additions and 307 deletions

View File

@@ -77,5 +77,5 @@ Most of these plugins also have their own READMEs in their folders; you can usua
- Translates text through Google Translate multiple times in order to get amusing results.
### Wikifetch
- Fork of [ProgVal's Wikipedia plugin](https://github.com/ProgVal/Supybot-plugins), with support for other wikis (via a `--site` option) and other improvements.
- **Requires:** [lxml](https://lxml.de/installation.html)
- Fetch content from MediaWiki-powered sites (Wikipedia, Fandom)
- **Requires:** [Beautiful Soup 4](http://www.crummy.com/software/BeautifulSoup/bs4/doc/), [mwparserfromhell](https://mwparserfromhell.readthedocs.io/)
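
For reference, a minimal sketch of the lookup the rewritten plugin performs against the MediaWiki API — the endpoint and page title below are illustrative, and this mirrors rather than reuses the plugin's own code:

# Sketch of the new approach: ask the MediaWiki API for a page's wikitext and
# head HTML, then summarize it. Endpoint and title are examples only.
import json
import urllib.parse
import urllib.request

import mwparserfromhell
from bs4 import BeautifulSoup

api = 'https://en.wikipedia.org/w/api.php'
params = urllib.parse.urlencode({
    'action': 'parse',
    'page': 'IRC',
    'prop': 'wikitext|headhtml',
    'formatversion': 2,
    'format': 'json',
    'redirects': True,
})
req = urllib.request.Request(f'{api}?{params}',
                             headers={'User-Agent': 'Wikifetch-example'})
with urllib.request.urlopen(req) as fd:
    data = json.load(fd)

# The first reasonably long line of the stripped wikitext works as a summary.
text = next(line for line in
            mwparserfromhell.parse(data['parse']['wikitext']).strip_code().splitlines()
            if len(line) >= 10)

# The canonical URL is advertised in the returned page <head>.
soup = BeautifulSoup(data['parse']['headhtml'], 'html.parser')
link = soup.find('link', rel='canonical')
print(text, '-', link['href'] if link else '')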

View File

@@ -1,7 +1,5 @@
###
# Copyright (c) 2010, quantumlemur
# Copyright (c) 2011, Valentin Lorentz
# Copyright (c) 2015, James Lu
# Copyright (c) 2023 James Lu
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -30,11 +28,9 @@
###
import supybot.conf as conf
import supybot.registry as registry
from supybot import conf, registry
try:
    from supybot.i18n import PluginInternationalization
    from supybot.i18n import internationalizeDocstring
    from supybot.i18n import PluginInternationalization, internationalizeDocstring
    _ = PluginInternationalization('Wikifetch')
except:
    # These are placeholder functions that allow the plugin to run on a bot
@@ -50,10 +46,8 @@ def configure(advanced):
    from supybot.questions import expect, anything, something, yn
    conf.registerPlugin('Wikifetch', True)
Wikifetch = conf.registerPlugin('Wikifetch')
conf.registerChannelValue(Wikifetch, 'url',
    registry.String(_('en.wikipedia.org'), _("""Default URL of the
    website to pull from.""")))

conf.registerGroup(Wikifetch, 'wikipedia')
conf.registerChannelValue(Wikifetch.wikipedia, 'lang',
    registry.String('en', _("""Default Wikipedia language""")))

# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:

View File

@@ -1,7 +1,7 @@
###
# Copyright (c) 2010, quantumlemur
# Copyright (c) 2011, Valentin Lorentz
# Copyright (c) 2015,2017 James Lu <james@overdrivenetworks.com>
# Copyright (c) 2015-2023 James Lu
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -29,17 +29,14 @@
# POSSIBILITY OF SUCH DAMAGE.
###
import json
import re
import sys
import lxml.html
import supybot.utils as utils
import urllib.parse
from supybot import callbacks, conf, ircutils, plugins, utils
from supybot.commands import wrap, getopts, additional
import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
try:
    from supybot.i18n import PluginInternationalization
    from supybot.i18n import internationalizeDocstring
    from supybot.i18n import PluginInternationalization, internationalizeDocstring
    _ = PluginInternationalization('Wikifetch')
except:
    # These are placeholder functions that allow the plugin to run on a bot
@@ -47,238 +44,92 @@ except:
    _ = lambda x:x
    internationalizeDocstring = lambda x:x
if sys.version_info[0] < 3:
    raise ImportError('This plugin requires Python 3. For a legacy version of this plugin that still '
                      'supports Python 2, consult the python2-legacy branch at '
                      'https://github.com/jlu5/SupyPlugins/tree/python2-legacy')
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
import mwparserfromhell
HEADERS = {
    'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version
}
class Wikifetch(callbacks.Plugin):
"""Grabs data from Wikipedia and other MediaWiki-powered sites."""
threaded = True
# This defines a series of suffixes this should be added after the domain name.
SPECIAL_URLS = {'wikia.com': '/wiki',
'wikipedia.org': '/wiki',
'wiki.archlinux.org': '/index.php',
'wiki.gentoo.org': '/wiki',
'mediawiki.org': '/wiki',
'wikimedia.org': '/wiki',
}
def _mediawiki_fetch(self, baseurl, title):
params = urllib.parse.urlencode({
'action': 'parse',
'page': title,
'prop': 'wikitext|headhtml',
'formatversion': 2,
'format': 'json',
'redirects': True
})
url = f"{baseurl}?{params}"
def _get_article_tree(self, baseurl, query, use_mw_parsing=True):
"""
Returns a wiki article tree given the base URL and search query. baseurl can be None,
in which case, searching is skipped and the search query will be treated as a raw address.
"""
self.log.debug('Wikifetch: fetching link %s', url)
with utils.web.getUrlFd(url, headers=HEADERS) as fd:
api_data = json.load(fd)
if baseurl is None:
addr = query
else:
# Different instances of MediaWiki use different URLs... This tries
# to make the parser work for most sites, but still use resonable defaults
# such as filling in http:// and appending /wiki to links...
baseurl = baseurl.lower()
for match, suffix in self.SPECIAL_URLS.items():
if match in baseurl:
baseurl += suffix
if error := api_data.get('error'):
error_code = error['code']
error_info = error['info']
raise callbacks.Error(f"MediaWiki API Error: {error_code} - {error_info} - {url}")
page_title = api_data['parse']['title']
content = api_data['parse']['wikitext']
html_head = api_data['parse']['headhtml']
mw = mwparserfromhell.parse(content)
for line in mw.strip_code().splitlines():
# Ignore stray image references that strip_code leaves behind
if re.search(r'\|?thumb\|', line):
continue
elif len(line) < 10:
continue
text = utils.str.normalizeWhitespace(line)
break
# Add http:// to the URL if a scheme isn't specified
if not baseurl.startswith(('http://', 'https://')):
baseurl = 'http://' + baseurl
if use_mw_parsing:
# first, we get the page
addr = '%s/Special:Search?search=%s' % \
(baseurl, quote_plus(query))
else:
addr = '%s/%s' % (baseurl, query)
raise callbacks.Error(f"No text paragraph found for page {page_title!r}")
self.log.debug('Wikifetch: using URL %s', addr)
soup = BeautifulSoup(html_head, features="lxml")
url = ''
if canonical_link := soup.find('link', rel='canonical'):
# Wikipedia
url = canonical_link.attrs['href']
elif og_url := soup.find('meta', property='og:url'):
# Fandom
url = og_url.attrs['content']
try:
article = utils.web.getUrl(addr, timeout=3)
except utils.web.Error:
self.log.exception('Failed to fetch link %s', addr)
raise
return (text, url)
article = article.decode()
tree = lxml.html.document_fromstring(article)
return (tree, article, addr)
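
For reference, the request that _mediawiki_fetch assembles looks roughly like the following (the page title is only an example); with formatversion=2 the interesting fields come back as plain strings under parse:

https://en.wikipedia.org/w/api.php?action=parse&page=IRC&prop=wikitext%7Cheadhtml&formatversion=2&format=json&redirects=True

The plugin then reads parse.title, parse.wikitext, and parse.headhtml from the JSON response, or error.code / error.info on failure.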
    def _wiki(self, irc, msg, search, baseurl, use_mw_parsing=True):
        """Fetches and replies content from a MediaWiki-powered website."""
        reply = ''

        # First, fetch and parse the page
        tree, article, addr = self._get_article_tree(baseurl, search, use_mw_parsing=use_mw_parsing)

        # check if it gives a "Did you mean..." redirect
        didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'
                                '[@title="Special:Search"]')
        if didyoumean:
            redirect = didyoumean[0].text_content().strip()
            if self.registryValue('showRedirects', msg.args[0]):
                reply += _('I didn\'t find anything for "%s". '
                           'Did you mean "%s"? ') % (search, redirect)
            tree, article, addr = self._get_article_tree(baseurl, didyoumean[0].get('href'))
            search = redirect

        # check if it's a page of search results (rather than an article), and
        # if so, retrieve the first result
        searchresults = tree.xpath('//div[@class="searchresults"]/ul/li/a') or \
            tree.xpath('//article/ul/li/a')  # Special case for Wikia (2017-01-27)
        self.log.debug('Wikifetch: got search results %s', searchresults)

        if searchresults:
            redirect = searchresults[0].text_content().strip()
            if self.registryValue('showRedirects', msg.args[0]):
                reply += _('I didn\'t find anything for "%s", but here\'s the '
                           'result for "%s": ') % (search, redirect)
            # Follow the search result and fetch that article. Note: use the original
            # base url to prevent prefixes like "/wiki" from being added twice.
            self.log.debug('Wikifetch: following search result:')
            tree, article, addr = self._get_article_tree(None, searchresults[0].get('href'))
            search = redirect

        # extract the address we got it from - most sites have the perm link
        # inside the page itself
        try:
            addr = tree.find(".//link[@rel='canonical']").attrib['href']
        except (ValueError, AttributeError):
            self.log.debug('Wikifetch: failed <link rel="canonical"> link extraction, skipping')
            try:
                addr = tree.find(".//div[@class='printfooter']/a").attrib['href']
                addr = re.sub(r'([&?]|(amp;)?)oldid=\d+$', '', addr)
            except (ValueError, AttributeError):
                self.log.debug('Wikifetch: failed printfooter link extraction, skipping')
                # If any of the above post-processing tricks fail, just ignore
                pass

        text_content = tree
        if use_mw_parsing:
            text_content = tree.xpath("//div[@class='mw-parser-output']") or tree.xpath("//div[@id='mw-content-text']")
            if text_content:
                text_content = text_content[0]
        self.log.debug('Wikifetch: Using %s as text_content', text_content)

        # check if it's a disambiguation page
        disambig = tree.xpath('//table[@id="disambigbox"]') or \
            tree.xpath('//table[@id="setindexbox"]') or \
            tree.xpath('//div[contains(@class, "disambig")]')  # Wikia (2017-01-27)
        if disambig:
            reply += format(_('%u is a disambiguation page. '), addr)
            disambig = text_content.xpath('./ul/li')
            disambig_results = []
            for item in disambig:
                for link in item.findall('a'):
                    if link.text is not None:
                        # Hackishly bold all <a> tags
                        link.text = "&#x02;%s&#x02;" % link.text
                item = item.text_content().replace('&#x02;', '\x02')
                # Normalize and strip whitespace, to prevent newlines and such
                # from corrupting the display.
                item = utils.str.normalizeWhitespace(item).strip()
                disambig_results.append(item)
            if disambig_results:
                reply += format(_('Possible results include: %s'), '; '.join(disambig_results))

        # Catch talk pages
        elif 'ns-talk' in tree.find("body").attrib.get('class', ''):
            reply += format(_('This article appears to be a talk page: %u'), addr)

        else:
            # Get the first paragraph as text.
            paragraphs = []
            for p in text_content.xpath("./p"):
                self.log.debug('Wikifetch: looking at paragraph %s', p.text_content())

                # Skip geographic coordinates, e.g. on articles for countries
                if p.xpath(".//span[@class='geo-dec']"):
                    continue
                # 2018-07-19: some articles have an empty p tag with this class and no content (why?)
                elif 'mw-empty-elt' in p.attrib.get('class', ''):
                    continue
                # Skip <p> tags with no content, for obvious reasons
                elif not p.text_content().strip():
                    continue

                paragraphs.append(p)

            if (not paragraphs) or 'wiki/Special:Search' in addr:
                if 'wikipedia:wikiproject' in addr.lower():
                    reply += format(_('This page appears to be a WikiProject page, '
                                      'but it is too complex for us to parse: %u'), addr)
                else:
                    irc.error(_('Not found, or page malformed.'), Raise=True)
            else:
                p = paragraphs[0]
                # Replace <b> tags with IRC-style bold, this has to be
                # done indirectly because unescaped '\x02' is invalid in XML
                for b_tag in p.xpath('//b'):
                    b_tag.text = "&#x02;%s&#x02;" % (b_tag.text or '')
                p = p.text_content()
                p = p.replace('&#x02;', '\x02')
                # Get rid of newlines, etc., that can corrupt the output.
                p = utils.str.normalizeWhitespace(p)
                p = p.strip()

                if not p:
                    reply = _('<Page was too complex to parse>')

                reply += format('%s %s %u', p, _('Retrieved from'), addr)

        reply = reply.replace('&amp;','&')

        # Remove inline citations (text[1][2][3]) as well as inline notes (text[note 1]).
        reply = re.sub(r'\[[a-z ]*?\d+\]', '', reply)

        return reply

    def _wiki(self, irc, baseurl, title):
        text, url = self._mediawiki_fetch(baseurl, title)
        if url:
            irc.reply(utils.str.format("%s - %u", text, url))
        else:
            irc.reply(text)
    @internationalizeDocstring
    @wrap([getopts({'site': 'somethingWithoutSpaces',
                    'no-mw-parsing': ''}),
           'text'])
    def wiki(self, irc, msg, args, optlist, search):
        """[--site <site>] [--no-mw-parsing] <search term>

        Returns the first paragraph of a wiki article. Optionally, a --site
        argument can be given to override the default (usually Wikipedia) -
        try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'.

        If the --no-mw-parsing option is given, MediaWiki-specific parsing is
        disabled. This has the following effects:
        1) No attempt at searching for a relevant Wiki page is made, and
           an article with the same name as the search term is directly
           retrieved.
        2) The plugin will retrieve the first <p> tag found on a page,
           regardless of where it's found, and print it as text. This may
           not work on all sites, as some use <p> for navbars and headings
           as well.
        """
        optlist = dict(optlist)
        baseurl = optlist.get('site') or self.registryValue('url', msg.args[0])
        text = self._wiki(irc, msg, search, baseurl,
                          use_mw_parsing=not optlist.get('no-mw-parsing'),
                          )
        irc.reply(text)

    @wrap([getopts({'lang': 'somethingWithoutSpaces'}), 'text'])
    def wiki(self, irc, msg, args, optlist, title):
        """<page title>

        Returns the first paragraph of a Wikipedia article.
        """
        optlist = dict(optlist)
        lang = optlist.get('lang') or \
            self.registryValue('wikipedia.lang', channel=msg.channel, network=irc.network)

        baseurl = f'https://{lang}.wikipedia.org/w/api.php'
        self._wiki(irc, baseurl, title)
    @internationalizeDocstring
    @wrap([additional('somethingWithoutSpaces')])
    def random(self, irc, msg, args, site):
        """[<site>]

        Returns the first paragraph of a random wiki article. Optionally, the 'site'
        argument can be given to override the default (usually Wikipedia)."""
        baseurl = site or self.registryValue('url', msg.args[0])
        text = self._wiki(irc, msg, 'Special:Random', baseurl)
        irc.reply(text)

    @wrap(['somethingWithoutSpaces', 'text'])
    def fandom(self, irc, msg, args, wiki_subdomain, title):
        """<wiki subdomain> <title>

        Returns the first paragraph of a Fandom article.
        """
        baseurl = f'https://{wiki_subdomain}.fandom.com/api.php'
        self._wiki(irc, baseurl, title)
Class = Wikifetch
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
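
Taken together with the tests below, typical usage of the reworked commands looks like this (the @ prefix is only an example prefix character):

@wiki Python (programming language)
@wiki --lang fr Paris
@fandom minecraft Ender Dragon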

View File

@@ -3,4 +3,8 @@ from supybot.setup import plugin_setup
plugin_setup(
    'Wikifetch',
    install_requires=[
        'bs4',
        'mwparserfromhell',
    ],
)
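
For manual installs that bypass setup.py, the same dependencies can be pulled in directly from PyPI (bs4 there is a thin wrapper around beautifulsoup4):

pip install beautifulsoup4 mwparserfromhell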

View File

@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
###
# Copyright (c) 2010, quantumlemur
# Copyright (c) 2011, Valentin Lorentz
# Copyright (c) 2015,2017 James Lu <james@overdrivenetworks.com>
# Copyright (c) 2023 James Lu
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -37,79 +35,27 @@ if network:
        plugins = ('Wikifetch',)

        def testWikipedia(self):
            self.assertRegexp('wiki Monty Python',
                              r'\x02Monty Python\x02.*?\.')
            self.assertRegexp('wiki roegdfjpoepo',
                              'Not found, or page malformed.*')
            self.assertRegexp('wiki Vancouver',
                              r'^Vancouver.*Canada')
            self.assertRegexp('wiki Python (programming language)',
                              'Python')
            # Follow redirects
            self.assertRegexp('wiki CYVR',
                              'Vancouver International Airport')
            # Display MW errors
            self.assertRegexp('wiki NotFoundTest',
                              "missingtitle - The page you specified doesn't exist")

        def testStripInlineCitations(self):
            self.assertNotRegexp('wiki UNICEF', '\[\d+\]')

        def testIgnoreCoordinates(self):
            # Articles for countries, cities, landmarks, etc. have GPS coordinates added to the top right.
            # These should be ignored because we want to focus on the actual article text.
            self.assertNotRegexp('wiki Canada', 'Coordinates\:')
            self.assertNotRegexp('wiki Eiffel Tower', 'Coordinates\:')
            self.assertNotRegexp('wiki Poland', 'Coordinates\:')

        def testWikipediaLang(self):
            self.assertRegexp('wiki --lang fr Paris', 'Paris.*capitale')

            with conf.supybot.plugins.Wikifetch.Wikipedia.lang.context('zh'):
                self.assertRegexp('wiki 地球', '地球.*太阳系')

        def testDisambig(self):
            self.assertRegexp('wiki Python', 'is a disambiguation page.*'
                              'Possible results include:.*?Pythonidae.*?;.*?;')
            self.assertRegexp('wiki Fire (disambiguation)', '.*Possible results include:.*')

        def testDisambigStripSpaces(self):
            self.assertNotRegexp('wiki Na', '\n')

        def testArticlesWithSymbolsInName(self):
            self.assertNotError('wiki /')
            self.assertNotError('wiki *')
            self.assertNotError('wiki GNU/Linux')
            self.assertNotError('wiki --site en.wikipedia.org /')

        def testFollowRedirects(self):
            self.assertRegexp('wiki YVR', 'Vancouver International Airport')

        def testWikiBold(self):
            self.assertRegexp('wiki Apple', '\x02')
            # Complex combination of the <a> tag inside a <b> tag; we should always use
            # empty bold content instead of the text "None".
            self.assertNotRegexp('wiki Fallstreak hole', 'None')

        def testWikiRandom(self):
            self.assertNotError('random')

        def testSiteCombinations(self):
            self.assertNotError('wiki --site en.wikipedia.org Bread')
            self.assertNotError('wiki --site https://en.wikipedia.org Bread')

        def testNonEnglishWikipedia(self):
            self.assertNotError('wiki --site fr.wikipedia.org Paris')
            self.assertNotError('wiki --site de.wikipedia.org Berlin')
            self.assertNotError('wiki --site zh.wikipedia.org 中文')
            self.assertNotError('wiki --site ar.wikipedia.org 2017')

    class Fandom(PluginTestCase):
        plugins = ('Wikifetch',)

        def testFandom(self):
            self.assertNotError('wiki --site help.fandom.com Formatting')
            self.assertRegexp('fandom minecraft Ender Dragon',
                              r'[Ee]nder [Dd]ragon.*boss')
            self.assertRegexp('fandom terraria Ocean', r'Ocean.*biome')

    class ArchLinuxWiki(PluginTestCase):
        plugins = ('Wikifetch',)

        def testArchWiki(self):
            self.assertNotError('wiki --site wiki.archlinux.org Bash')

    class GentooWiki(PluginTestCase):
        plugins = ('Wikifetch',)

        def testGentooWiki(self):
            self.assertNotError('wiki --site wiki.gentoo.org OpenRC')

    class WikimediaSites(PluginTestCase):
        plugins = ('Wikifetch',)

        def testMediaWiki(self):
            self.assertNotError('wiki --site mediawiki.org Sites using MediaWiki')
# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
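
The suite above is network-dependent (note the if network: guard). Assuming the plugin lives in a directory named Wikifetch on the plugin path, it can be run with Limnoria's test runner:

supybot-test ./Wikifetch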