Wikifetch: rewrite plugin using MediaWiki API

This commit is contained in:
James Lu 2023-02-12 16:07:47 -08:00
parent 31c97a646b
commit 582a02f74e
5 changed files with 102 additions and 307 deletions

View File

@@ -77,5 +77,5 @@ Most of these plugins also have their own READMEs in their folders; you can usua
- Translates text through Google Translate multiple times in order to get amusing results.
### Wikifetch
- Fork of [ProgVal's Wikipedia plugin](https://github.com/ProgVal/Supybot-plugins), with support for other wikis (via a `--site` option) and other improvements.
- **Requires:** [lxml](https://lxml.de/installation.html)
- Fetch content from MediaWiki-powered sites (Wikipedia, Fandom)
- **Requires:** [Beautiful Soup 4](http://www.crummy.com/software/BeautifulSoup/bs4/doc/), [mwparserfromhell](https://mwparserfromhell.readthedocs.io/)
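
For reference, a minimal sketch of the lookup the rewritten plugin performs against the MediaWiki API — the endpoint and page title below are illustrative, and this mirrors rather than reuses the plugin's own code:

# Sketch of the new approach: ask the MediaWiki API for a page's wikitext and
# head HTML, then summarize it. Endpoint and title are examples only.
import json
import urllib.parse
import urllib.request

import mwparserfromhell
from bs4 import BeautifulSoup

api = 'https://en.wikipedia.org/w/api.php'
params = urllib.parse.urlencode({
    'action': 'parse',
    'page': 'IRC',
    'prop': 'wikitext|headhtml',
    'formatversion': 2,
    'format': 'json',
    'redirects': True,
})
req = urllib.request.Request(f'{api}?{params}',
                             headers={'User-Agent': 'Wikifetch-example'})
with urllib.request.urlopen(req) as fd:
    data = json.load(fd)

# The first reasonably long line of the stripped wikitext works as a summary.
text = next(line for line in
            mwparserfromhell.parse(data['parse']['wikitext']).strip_code().splitlines()
            if len(line) >= 10)

# The canonical URL is advertised in the returned page <head>.
soup = BeautifulSoup(data['parse']['headhtml'], 'html.parser')
link = soup.find('link', rel='canonical')
print(text, '-', link['href'] if link else '')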

View File

@@ -1,7 +1,5 @@
###
# Copyright (c) 2010, quantumlemur
# Copyright (c) 2011, Valentin Lorentz
# Copyright (c) 2015, James Lu
# Copyright (c) 2023 James Lu
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -30,11 +28,9 @@
###
import supybot.conf as conf
import supybot.registry as registry
from supybot import conf, registry
try:
    from supybot.i18n import PluginInternationalization
    from supybot.i18n import internationalizeDocstring
    from supybot.i18n import PluginInternationalization, internationalizeDocstring
    _ = PluginInternationalization('Wikifetch')
except:
    # These are placeholder functions that allow the plugin to run on a bot
@@ -50,10 +46,8 @@ def configure(advanced):
    from supybot.questions import expect, anything, something, yn
    conf.registerPlugin('Wikifetch', True)
Wikifetch = conf.registerPlugin('Wikifetch')
conf.registerChannelValue(Wikifetch, 'url',
    registry.String(_('en.wikipedia.org'), _("""Default URL of the
    website to pull from.""")))

conf.registerGroup(Wikifetch, 'wikipedia')
conf.registerChannelValue(Wikifetch.wikipedia, 'lang',
    registry.String('en', _("""Default Wikipedia language""")))

# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:

View File

@@ -1,7 +1,7 @@
###
# Copyright (c) 2010, quantumlemur
# Copyright (c) 2011, Valentin Lorentz
# Copyright (c) 2015,2017 James Lu <james@overdrivenetworks.com>
# Copyright (c) 2015-2023 James Lu
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -29,17 +29,14 @@
# POSSIBILITY OF SUCH DAMAGE.
###
import json
import re
import sys
import lxml.html
import supybot.utils as utils
import urllib.parse
from supybot import callbacks, conf, ircutils, plugins, utils
from supybot.commands import wrap, getopts, additional
import supybot.plugins as plugins
import supybot.ircutils as ircutils
import supybot.callbacks as callbacks
try:
    from supybot.i18n import PluginInternationalization
    from supybot.i18n import internationalizeDocstring
    from supybot.i18n import PluginInternationalization, internationalizeDocstring
    _ = PluginInternationalization('Wikifetch')
except:
    # These are placeholder functions that allow the plugin to run on a bot
@@ -47,238 +44,92 @@ except:
    _ = lambda x:x
    internationalizeDocstring = lambda x:x
if sys.version_info[0] < 3:
    raise ImportError('This plugin requires Python 3. For a legacy version of this plugin that still '
                      'supports Python 2, consult the python2-legacy branch at '
                      'https://github.com/jlu5/SupyPlugins/tree/python2-legacy')
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
import mwparserfromhell
HEADERS = {
    'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version
}
class Wikifetch(callbacks.Plugin):
"""Grabs data from Wikipedia and other MediaWiki-powered sites."""
threaded = True
# This defines a series of suffixes this should be added after the domain name.
SPECIAL_URLS = {'wikia.com': '/wiki',
'wikipedia.org': '/wiki',
'wiki.archlinux.org': '/index.php',
'wiki.gentoo.org': '/wiki',
'mediawiki.org': '/wiki',
'wikimedia.org': '/wiki',
}
def _mediawiki_fetch(self, baseurl, title):
params = urllib.parse.urlencode({
'action': 'parse',
'page': title,
'prop': 'wikitext|headhtml',
'formatversion': 2,
'format': 'json',
'redirects': True
})
url = f"{baseurl}?{params}"
def _get_article_tree(self, baseurl, query, use_mw_parsing=True):
"""
Returns a wiki article tree given the base URL and search query. baseurl can be None,
in which case, searching is skipped and the search query will be treated as a raw address.
"""
self.log.debug('Wikifetch: fetching link %s', url)
with utils.web.getUrlFd(url, headers=HEADERS) as fd:
api_data = json.load(fd)
if baseurl is None:
addr = query
else:
# Different instances of MediaWiki use different URLs... This tries
# to make the parser work for most sites, but still use resonable defaults
# such as filling in http:// and appending /wiki to links...
baseurl = baseurl.lower()
for match, suffix in self.SPECIAL_URLS.items():
if match in baseurl:
baseurl += suffix
if error := api_data.get('error'):
error_code = error['code']
error_info = error['info']
raise callbacks.Error(f"MediaWiki API Error: {error_code} - {error_info} - {url}")
page_title = api_data['parse']['title']
content = api_data['parse']['wikitext']
html_head = api_data['parse']['headhtml']
mw = mwparserfromhell.parse(content)
for line in mw.strip_code().splitlines():
# Ignore stray image references that strip_code leaves behind
if re.search(r'\|?thumb\|', line):
continue
elif len(line) < 10:
continue
text = utils.str.normalizeWhitespace(line)
break
# Add http:// to the URL if a scheme isn't specified
if not baseurl.startswith(('http://', 'https://')):
baseurl = 'http://' + baseurl
if use_mw_parsing:
# first, we get the page
addr = '%s/Special:Search?search=%s' % \
(baseurl, quote_plus(query))
else:
addr = '%s/%s' % (baseurl, query)
raise callbacks.Error(f"No text paragraph found for page {page_title!r}")
self.log.debug('Wikifetch: using URL %s', addr)
soup = BeautifulSoup(html_head, features="lxml")
url = ''
if canonical_link := soup.find('link', rel='canonical'):
# Wikipedia
url = canonical_link.attrs['href']
elif og_url := soup.find('meta', property='og:url'):
# Fandom
url = og_url.attrs['content']
try:
article = utils.web.getUrl(addr, timeout=3)
except utils.web.Error:
self.log.exception('Failed to fetch link %s', addr)
raise
return (text, url)
article = article.decode()
tree = lxml.html.document_fromstring(article)
return (tree, article, addr)
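
For reference, the request that _mediawiki_fetch assembles looks roughly like the following (the page title is only an example); with formatversion=2 the interesting fields come back as plain strings under parse:

https://en.wikipedia.org/w/api.php?action=parse&page=IRC&prop=wikitext%7Cheadhtml&formatversion=2&format=json&redirects=True

The plugin then reads parse.title, parse.wikitext, and parse.headhtml from the JSON response, or error.code / error.info on failure.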
    def _wiki(self, irc, msg, search, baseurl, use_mw_parsing=True):
        """Fetches and replies content from a MediaWiki-powered website."""
        reply = ''

        # First, fetch and parse the page
        tree, article, addr = self._get_article_tree(baseurl, search, use_mw_parsing=use_mw_parsing)

        # check if it gives a "Did you mean..." redirect
        didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'
                                '[@title="Special:Search"]')
        if didyoumean:
            redirect = didyoumean[0].text_content().strip()
            if self.registryValue('showRedirects', msg.args[0]):
                reply += _('I didn\'t find anything for "%s". '
                           'Did you mean "%s"? ') % (search, redirect)
            tree, article, addr = self._get_article_tree(baseurl, didyoumean[0].get('href'))
            search = redirect

        # check if it's a page of search results (rather than an article), and
        # if so, retrieve the first result
        searchresults = tree.xpath('//div[@class="searchresults"]/ul/li/a') or \
            tree.xpath('//article/ul/li/a')  # Special case for Wikia (2017-01-27)
        self.log.debug('Wikifetch: got search results %s', searchresults)

        if searchresults:
            redirect = searchresults[0].text_content().strip()
            if self.registryValue('showRedirects', msg.args[0]):
                reply += _('I didn\'t find anything for "%s", but here\'s the '
                           'result for "%s": ') % (search, redirect)
            # Follow the search result and fetch that article. Note: use the original
            # base url to prevent prefixes like "/wiki" from being added twice.
            self.log.debug('Wikifetch: following search result:')
            tree, article, addr = self._get_article_tree(None, searchresults[0].get('href'))
            search = redirect

        # extract the address we got it from - most sites have the perm link
        # inside the page itself
        try:
            addr = tree.find(".//link[@rel='canonical']").attrib['href']
        except (ValueError, AttributeError):
            self.log.debug('Wikifetch: failed <link rel="canonical"> link extraction, skipping')
            try:
                addr = tree.find(".//div[@class='printfooter']/a").attrib['href']
                addr = re.sub(r'([&?]|(amp;)?)oldid=\d+$', '', addr)
            except (ValueError, AttributeError):
                self.log.debug('Wikifetch: failed printfooter link extraction, skipping')
                # If any of the above post-processing tricks fail, just ignore
                pass

        text_content = tree
        if use_mw_parsing:
            text_content = tree.xpath("//div[@class='mw-parser-output']") or tree.xpath("//div[@id='mw-content-text']")
            if text_content:
                text_content = text_content[0]
        self.log.debug('Wikifetch: Using %s as text_content', text_content)

        # check if it's a disambiguation page
        disambig = tree.xpath('//table[@id="disambigbox"]') or \
            tree.xpath('//table[@id="setindexbox"]') or \
            tree.xpath('//div[contains(@class, "disambig")]')  # Wikia (2017-01-27)
        if disambig:
            reply += format(_('%u is a disambiguation page. '), addr)
            disambig = text_content.xpath('./ul/li')
            disambig_results = []
            for item in disambig:
                for link in item.findall('a'):
                    if link.text is not None:
                        # Hackishly bold all <a> tags
                        link.text = "&#x02;%s&#x02;" % link.text
                item = item.text_content().replace('&#x02;', '\x02')
                # Normalize and strip whitespace, to prevent newlines and such
                # from corrupting the display.
                item = utils.str.normalizeWhitespace(item).strip()
                disambig_results.append(item)
            if disambig_results:
                reply += format(_('Possible results include: %s'), '; '.join(disambig_results))

        # Catch talk pages
        elif 'ns-talk' in tree.find("body").attrib.get('class', ''):
            reply += format(_('This article appears to be a talk page: %u'), addr)

        else:
            # Get the first paragraph as text.
            paragraphs = []
            for p in text_content.xpath("./p"):
                self.log.debug('Wikifetch: looking at paragraph %s', p.text_content())

                # Skip geographic coordinates, e.g. on articles for countries
                if p.xpath(".//span[@class='geo-dec']"):
                    continue
                # 2018-07-19: some articles have an empty p tag with this class and no content (why?)
                elif 'mw-empty-elt' in p.attrib.get('class', ''):
                    continue
                # Skip <p> tags with no content, for obvious reasons
                elif not p.text_content().strip():
                    continue

                paragraphs.append(p)

            if (not paragraphs) or 'wiki/Special:Search' in addr:
                if 'wikipedia:wikiproject' in addr.lower():
                    reply += format(_('This page appears to be a WikiProject page, '
                                      'but it is too complex for us to parse: %u'), addr)
                else:
                    irc.error(_('Not found, or page malformed.'), Raise=True)
            else:
                p = paragraphs[0]
                # Replace <b> tags with IRC-style bold, this has to be
                # done indirectly because unescaped '\x02' is invalid in XML
                for b_tag in p.xpath('//b'):
                    b_tag.text = "&#x02;%s&#x02;" % (b_tag.text or '')
                p = p.text_content()
                p = p.replace('&#x02;', '\x02')
                # Get rid of newlines, etc., that can corrupt the output.
                p = utils.str.normalizeWhitespace(p)
                p = p.strip()

                if not p:
                    reply = _('<Page was too complex to parse>')

                reply += format('%s %s %u', p, _('Retrieved from'), addr)

        reply = reply.replace('&amp;','&')

        # Remove inline citations (text[1][2][3]) as well as inline notes (text[note 1]).
        reply = re.sub(r'\[[a-z ]*?\d+\]', '', reply)

        return reply

    def _wiki(self, irc, baseurl, title):
        text, url = self._mediawiki_fetch(baseurl, title)
        if url:
            irc.reply(utils.str.format("%s - %u", text, url))
        else:
            irc.reply(text)
    @internationalizeDocstring
    @wrap([getopts({'site': 'somethingWithoutSpaces',
                    'no-mw-parsing': ''}),
           'text'])
    def wiki(self, irc, msg, args, optlist, search):
        """[--site <site>] [--no-mw-parsing] <search term>

        Returns the first paragraph of a wiki article. Optionally, a --site
        argument can be given to override the default (usually Wikipedia) -
        try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'.

        If the --no-mw-parsing option is given, MediaWiki-specific parsing is
        disabled. This has the following effects:
        1) No attempt at searching for a relevant Wiki page is made, and
           an article with the same name as the search term is directly
           retrieved.
        2) The plugin will retrieve the first <p> tag found on a page,
           regardless of where it's found, and print it as text. This may
           not work on all sites, as some use <p> for navbars and headings
           as well.
        """
        optlist = dict(optlist)
        baseurl = optlist.get('site') or self.registryValue('url', msg.args[0])
        text = self._wiki(irc, msg, search, baseurl,
                          use_mw_parsing=not optlist.get('no-mw-parsing'),
                          )
        irc.reply(text)

    @wrap([getopts({'lang': 'somethingWithoutSpaces'}), 'text'])
    def wiki(self, irc, msg, args, optlist, title):
        """<page title>

        Returns the first paragraph of a Wikipedia article.
        """
        optlist = dict(optlist)
        lang = optlist.get('lang') or \
            self.registryValue('wikipedia.lang', channel=msg.channel, network=irc.network)

        baseurl = f'https://{lang}.wikipedia.org/w/api.php'
        self._wiki(irc, baseurl, title)
    @internationalizeDocstring
    @wrap([additional('somethingWithoutSpaces')])
    def random(self, irc, msg, args, site):
        """[<site>]

        Returns the first paragraph of a random wiki article. Optionally, the 'site'
        argument can be given to override the default (usually Wikipedia)."""
        baseurl = site or self.registryValue('url', msg.args[0])
        text = self._wiki(irc, msg, 'Special:Random', baseurl)
        irc.reply(text)

    @wrap(['somethingWithoutSpaces', 'text'])
    def fandom(self, irc, msg, args, wiki_subdomain, title):
        """<wiki subdomain> <title>

        Returns the first paragraph of a Fandom article.
        """
        baseurl = f'https://{wiki_subdomain}.fandom.com/api.php'
        self._wiki(irc, baseurl, title)
Class = Wikifetch
# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79:
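
Taken together with the tests below, typical usage of the reworked commands looks like this (the @ prefix is only an example prefix character):

@wiki Python (programming language)
@wiki --lang fr Paris
@fandom minecraft Ender Dragon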

View File

@@ -3,4 +3,8 @@ from supybot.setup import plugin_setup
plugin_setup(
    'Wikifetch',
    install_requires=[
        'bs4',
        'mwparserfromhell',
    ],
)
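
For manual installs that bypass setup.py, the same dependencies can be pulled in directly from PyPI (bs4 there is a thin wrapper around beautifulsoup4):

pip install beautifulsoup4 mwparserfromhell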

View File

@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
###
# Copyright (c) 2010, quantumlemur
# Copyright (c) 2011, Valentin Lorentz
# Copyright (c) 2015,2017 James Lu <james@overdrivenetworks.com>
# Copyright (c) 2023 James Lu
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -37,79 +35,27 @@ if network:
        plugins = ('Wikifetch',)

        def testWikipedia(self):
            self.assertRegexp('wiki Monty Python',
                              r'\x02Monty Python\x02.*?\.')
            self.assertRegexp('wiki roegdfjpoepo',
                              'Not found, or page malformed.*')
            self.assertRegexp('wiki Vancouver',
                              r'^Vancouver.*Canada')
            self.assertRegexp('wiki Python (programming language)',
                              'Python')
            # Follow redirects
            self.assertRegexp('wiki CYVR',
                              'Vancouver International Airport')
            # Display MW errors
            self.assertRegexp('wiki NotFoundTest',
                              "missingtitle - The page you specified doesn't exist")

        def testStripInlineCitations(self):
            self.assertNotRegexp('wiki UNICEF', '\[\d+\]')

        def testIgnoreCoordinates(self):
            # Articles for countries, cities, landmarks, etc. have GPS coordinates added to the top right.
            # These should be ignored because we want to focus on the actual article text.
            self.assertNotRegexp('wiki Canada', 'Coordinates\:')
            self.assertNotRegexp('wiki Eiffel Tower', 'Coordinates\:')
            self.assertNotRegexp('wiki Poland', 'Coordinates\:')

        def testWikipediaLang(self):
            self.assertRegexp('wiki --lang fr Paris', 'Paris.*capitale')

            with conf.supybot.plugins.Wikifetch.Wikipedia.lang.context('zh'):
                self.assertRegexp('wiki 地球', '地球.*太阳系')

        def testDisambig(self):
            self.assertRegexp('wiki Python', 'is a disambiguation page.*'
                              'Possible results include:.*?Pythonidae.*?;.*?;')
            self.assertRegexp('wiki Fire (disambiguation)', '.*Possible results include:.*')

        def testDisambigStripSpaces(self):
            self.assertNotRegexp('wiki Na', '\n')

        def testArticlesWithSymbolsInName(self):
            self.assertNotError('wiki /')
            self.assertNotError('wiki *')
            self.assertNotError('wiki GNU/Linux')
            self.assertNotError('wiki --site en.wikipedia.org /')

        def testFollowRedirects(self):
            self.assertRegexp('wiki YVR', 'Vancouver International Airport')

        def testWikiBold(self):
            self.assertRegexp('wiki Apple', '\x02')
            # Complex combination of the <a> tag inside a <b> tag; we should always use
            # empty bold content instead of the text "None".
            self.assertNotRegexp('wiki Fallstreak hole', 'None')

        def testWikiRandom(self):
            self.assertNotError('random')

        def testSiteCombinations(self):
            self.assertNotError('wiki --site en.wikipedia.org Bread')
            self.assertNotError('wiki --site https://en.wikipedia.org Bread')

        def testNonEnglishWikipedia(self):
            self.assertNotError('wiki --site fr.wikipedia.org Paris')
            self.assertNotError('wiki --site de.wikipedia.org Berlin')
            self.assertNotError('wiki --site zh.wikipedia.org 中文')
            self.assertNotError('wiki --site ar.wikipedia.org 2017')

    class Fandom(PluginTestCase):
        plugins = ('Wikifetch',)

        def testFandom(self):
            self.assertNotError('wiki --site help.fandom.com Formatting')
            self.assertRegexp('fandom minecraft Ender Dragon',
                              r'[Ee]nder [Dd]ragon.*boss')
            self.assertRegexp('fandom terraria Ocean', r'Ocean.*biome')

    class ArchLinuxWiki(PluginTestCase):
        plugins = ('Wikifetch',)

        def testArchWiki(self):
            self.assertNotError('wiki --site wiki.archlinux.org Bash')

    class GentooWiki(PluginTestCase):
        plugins = ('Wikifetch',)

        def testGentooWiki(self):
            self.assertNotError('wiki --site wiki.gentoo.org OpenRC')

    class WikimediaSites(PluginTestCase):
        plugins = ('Wikifetch',)

        def testMediaWiki(self):
            self.assertNotError('wiki --site mediawiki.org Sites using MediaWiki')
# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
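
The suite above is network-dependent (note the if network: guard). Assuming the plugin lives in a directory named Wikifetch on the plugin path, it can be run with Limnoria's test runner:

supybot-test ./Wikifetch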