diff --git a/README.md b/README.md index 8c96c94..4a0f683 100644 --- a/README.md +++ b/README.md @@ -77,5 +77,5 @@ Most of these plugins also have their own READMEs in their folders; you can usua - Translates text through Google Translate multiple times in order to get amusing results. ### Wikifetch -- Fork of [ProgVal's Wikipedia plugin](https://github.com/ProgVal/Supybot-plugins), with support for other wikis (via a `--site` option) and other improvements. - - **Requires:** [lxml](https://lxml.de/installation.html) +- Fetch content from MediaWiki-powered sites (Wikipedia, Fandom) + - **Requires:** [Beautiful Soup 4](http://www.crummy.com/software/BeautifulSoup/bs4/doc/), [mwparserfromhell](https://mwparserfromhell.readthedocs.io/) diff --git a/Wikifetch/config.py b/Wikifetch/config.py index 1fb67d5..dee5552 100644 --- a/Wikifetch/config.py +++ b/Wikifetch/config.py @@ -1,7 +1,5 @@ ### -# Copyright (c) 2010, quantumlemur -# Copyright (c) 2011, Valentin Lorentz -# Copyright (c) 2015, James Lu +# Copyright (c) 2023 James Lu # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,11 +28,9 @@ ### -import supybot.conf as conf -import supybot.registry as registry +from supybot import conf, registry try: - from supybot.i18n import PluginInternationalization - from supybot.i18n import internationalizeDocstring + from supybot.i18n import PluginInternationalization, internationalizeDocstring _ = PluginInternationalization('Wikifetch') except: # This are useless functions that's allow to run the plugin on a bot @@ -50,10 +46,8 @@ def configure(advanced): from supybot.questions import expect, anything, something, yn conf.registerPlugin('Wikifetch', True) - Wikifetch = conf.registerPlugin('Wikifetch') -conf.registerChannelValue(Wikifetch, 'url', - registry.String(_('en.wikipedia.org'), _("""Default URL of the - website to pull from."""))) -# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79: +conf.registerGroup(Wikifetch, 'wikipedia') +conf.registerChannelValue(Wikifetch.wikipedia, 'lang', + registry.String('en', _("""Default Wikipedia language"""))) diff --git a/Wikifetch/plugin.py b/Wikifetch/plugin.py index 477fa71..4ddc0fa 100644 --- a/Wikifetch/plugin.py +++ b/Wikifetch/plugin.py @@ -1,7 +1,7 @@ ### # Copyright (c) 2010, quantumlemur # Copyright (c) 2011, Valentin Lorentz -# Copyright (c) 2015,2017 James Lu +# Copyright (c) 2015-2023 James Lu # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,17 +29,14 @@ # POSSIBILITY OF SUCH DAMAGE. ### +import json import re -import sys -import lxml.html -import supybot.utils as utils +import urllib.parse + +from supybot import callbacks, conf, ircutils, plugins, utils from supybot.commands import wrap, getopts, additional -import supybot.plugins as plugins -import supybot.ircutils as ircutils -import supybot.callbacks as callbacks try: - from supybot.i18n import PluginInternationalization - from supybot.i18n import internationalizeDocstring + from supybot.i18n import PluginInternationalization, internationalizeDocstring _ = PluginInternationalization('Wikifetch') except: # This are useless functions that's allow to run the plugin on a bot @@ -47,238 +44,92 @@ except: _ = lambda x:x internationalizeDocstring = lambda x:x -if sys.version_info[0] < 3: - raise ImportError('This plugin requires Python 3. 
For a legacy version of this plugin that still ' - 'supports Python 2, consult the python2-legacy branch at ' - 'https://github.com/jlu5/SupyPlugins/tree/python2-legacy') -from urllib.parse import quote_plus +from bs4 import BeautifulSoup +import mwparserfromhell + +HEADERS = { + 'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version +} class Wikifetch(callbacks.Plugin): """Grabs data from Wikipedia and other MediaWiki-powered sites.""" threaded = True - # This defines a series of suffixes this should be added after the domain name. - SPECIAL_URLS = {'wikia.com': '/wiki', - 'wikipedia.org': '/wiki', - 'wiki.archlinux.org': '/index.php', - 'wiki.gentoo.org': '/wiki', - 'mediawiki.org': '/wiki', - 'wikimedia.org': '/wiki', - } + def _mediawiki_fetch(self, baseurl, title): + params = urllib.parse.urlencode({ + 'action': 'parse', + 'page': title, + 'prop': 'wikitext|headhtml', + 'formatversion': 2, + 'format': 'json', + 'redirects': True + }) + url = f"{baseurl}?{params}" - def _get_article_tree(self, baseurl, query, use_mw_parsing=True): - """ - Returns a wiki article tree given the base URL and search query. baseurl can be None, - in which case, searching is skipped and the search query will be treated as a raw address. - """ + self.log.debug('Wikifetch: fetching link %s', url) + with utils.web.getUrlFd(url, headers=HEADERS) as fd: + api_data = json.load(fd) - if baseurl is None: - addr = query + if error := api_data.get('error'): + error_code = error['code'] + error_info = error['info'] + raise callbacks.Error(f"MediaWiki API Error: {error_code} - {error_info} - {url}") + + page_title = api_data['parse']['title'] + content = api_data['parse']['wikitext'] + html_head = api_data['parse']['headhtml'] + mw = mwparserfromhell.parse(content) + for line in mw.strip_code().splitlines(): + # Ignore stray image references that strip_code leaves behind + if re.search(r'\|?thumb\|', line): + continue + elif len(line) < 10: + continue + text = utils.str.normalizeWhitespace(line) + break else: - # Different instances of MediaWiki use different URLs... This tries - # to make the parser work for most sites, but still use resonable defaults - # such as filling in http:// and appending /wiki to links... 
- baseurl = baseurl.lower() - for match, suffix in self.SPECIAL_URLS.items(): - if match in baseurl: - baseurl += suffix - break + raise callbacks.Error(f"No text paragraph found for page {page_title!r}") - # Add http:// to the URL if a scheme isn't specified - if not baseurl.startswith(('http://', 'https://')): - baseurl = 'http://' + baseurl + soup = BeautifulSoup(html_head, features="lxml") + url = '' + if canonical_link := soup.find('link', rel='canonical'): + # Wikipedia + url = canonical_link.attrs['href'] + elif og_url := soup.find('meta', property='og:url'): + # Fandom + url = og_url.attrs['content'] - if use_mw_parsing: - # first, we get the page - addr = '%s/Special:Search?search=%s' % \ - (baseurl, quote_plus(query)) - else: - addr = '%s/%s' % (baseurl, query) + return (text, url) - self.log.debug('Wikifetch: using URL %s', addr) - - try: - article = utils.web.getUrl(addr, timeout=3) - except utils.web.Error: - self.log.exception('Failed to fetch link %s', addr) - raise - - article = article.decode() - - tree = lxml.html.document_fromstring(article) - return (tree, article, addr) - - def _wiki(self, irc, msg, search, baseurl, use_mw_parsing=True): - """Fetches and replies content from a MediaWiki-powered website.""" - reply = '' - - # First, fetch and parse the page - tree, article, addr = self._get_article_tree(baseurl, search, use_mw_parsing=use_mw_parsing) - - # check if it gives a "Did you mean..." redirect - didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a' - '[@title="Special:Search"]') - if didyoumean: - redirect = didyoumean[0].text_content().strip() - if self.registryValue('showRedirects', msg.args[0]): - reply += _('I didn\'t find anything for "%s". ' - 'Did you mean "%s"? ') % (search, redirect) - - tree, article, addr = self._get_article_tree(baseurl, didyoumean[0].get('href')) - search = redirect - - # check if it's a page of search results (rather than an article), and - # if so, retrieve the first result - searchresults = tree.xpath('//div[@class="searchresults"]/ul/li/a') or \ - tree.xpath('//article/ul/li/a') # Special case for Wikia (2017-01-27) - self.log.debug('Wikifetch: got search results %s', searchresults) - - if searchresults: - redirect = searchresults[0].text_content().strip() - if self.registryValue('showRedirects', msg.args[0]): - reply += _('I didn\'t find anything for "%s", but here\'s the ' - 'result for "%s": ') % (search, redirect) - # Follow the search result and fetch that article. Note: use the original - # base url to prevent prefixes like "/wiki" from being added twice. 
- self.log.debug('Wikifetch: following search result:') - tree, article, addr = self._get_article_tree(None, searchresults[0].get('href')) - search = redirect - - # extract the address we got it from - most sites have the perm link - # inside the page itself - try: - addr = tree.find(".//link[@rel='canonical']").attrib['href'] - except (ValueError, AttributeError): - self.log.debug('Wikifetch: failed link extraction, skipping') - try: - addr = tree.find(".//div[@class='printfooter']/a").attrib['href'] - addr = re.sub(r'([&?]|(amp;)?)oldid=\d+$', '', addr) - except (ValueError, AttributeError): - self.log.debug('Wikifetch: failed printfooter link extraction, skipping') - # If any of the above post-processing tricks fail, just ignore - pass - - text_content = tree - if use_mw_parsing: - text_content = tree.xpath("//div[@class='mw-parser-output']") or tree.xpath("//div[@id='mw-content-text']") - if text_content: - text_content = text_content[0] - self.log.debug('Wikifetch: Using %s as text_content', text_content) - - # check if it's a disambiguation page - disambig = tree.xpath('//table[@id="disambigbox"]') or \ - tree.xpath('//table[@id="setindexbox"]') or \ - tree.xpath('//div[contains(@class, "disambig")]') # Wikia (2017-01-27) - if disambig: - reply += format(_('%u is a disambiguation page. '), addr) - disambig = text_content.xpath('./ul/li') - - disambig_results = [] - for item in disambig: - for link in item.findall('a'): - if link.text is not None: - # Hackishly bold all tags - link.text = "%s" % link.text - item = item.text_content().replace('', '\x02') - # Normalize and strip whitespace, to prevent newlines and such - # from corrupting the display. - item = utils.str.normalizeWhitespace(item).strip() - disambig_results.append(item) - if disambig_results: - reply += format(_('Possible results include: %s'), '; '.join(disambig_results)) - - # Catch talk pages - elif 'ns-talk' in tree.find("body").attrib.get('class', ''): - reply += format(_('This article appears to be a talk page: %u'), addr) + def _wiki(self, irc, baseurl, title): + text, url = self._mediawiki_fetch(baseurl, title) + if url: + irc.reply(utils.str.format("%s - %u", text, url)) else: - # Get the first paragraph as text. - paragraphs = [] - for p in text_content.xpath("./p"): - self.log.debug('Wikifetch: looking at paragraph %s', p.text_content()) - - # Skip geographic coordinates, e.g. on articles for countries - if p.xpath(".//span[@class='geo-dec']"): - continue - # 2018-07-19: some articles have an empty p tag with this class and no content (why?) - elif 'mw-empty-elt' in p.attrib.get('class', ''): - continue - # Skip
<p>
tags with no content, for obvious reasons - elif not p.text_content().strip(): - continue - - paragraphs.append(p) - - if (not paragraphs) or 'wiki/Special:Search' in addr: - if 'wikipedia:wikiproject' in addr.lower(): - reply += format(_('This page appears to be a WikiProject page, ' - 'but it is too complex for us to parse: %u'), addr) - else: - irc.error(_('Not found, or page malformed.'), Raise=True) - else: - p = paragraphs[0] - # Replace tags with IRC-style bold, this has to be - # done indirectly because unescaped '\x02' is invalid in XML - for b_tag in p.xpath('//b'): - b_tag.text = "%s" % (b_tag.text or '') - p = p.text_content() - p = p.replace('', '\x02') - # Get rid of newlines, etc., that can corrupt the output. - p = utils.str.normalizeWhitespace(p) - p = p.strip() - - if not p: - reply = _('') - - reply += format('%s %s %u', p, _('Retrieved from'), addr) - reply = reply.replace('&','&') - - # Remove inline citations (text[1][2][3]) as well as inline notes (text[note 1]). - reply = re.sub(r'\[[a-z ]*?\d+\]', '', reply) - - return reply + irc.reply(text) @internationalizeDocstring - @wrap([getopts({'site': 'somethingWithoutSpaces', - 'no-mw-parsing': ''}), - 'text']) - def wiki(self, irc, msg, args, optlist, search): - """[--site ] [--no-mw-parsing] + @wrap([getopts({'lang': 'somethingWithoutSpaces'}), 'text']) + def wiki(self, irc, msg, args, optlist, title): + """ - Returns the first paragraph of a wiki article. Optionally, a --site - argument can be given to override the default (usually Wikipedia) - - try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'. - - If the --no-mw-parsing option is given, MediaWiki-specific parsing is - disabled. This has the following effects: - 1) No attempt at searching for a relevant Wiki page is made, and - an article with the same name as the search term is directly - retrieved. - 2) The plugin will retrieve the first
<p>
tag found on a page, - regardless of where it's found, and print it as text. This may - not work on all sites, as some use
<p>
for navbars and headings - as well. + Returns the first paragraph of a Wikipedia article. """ optlist = dict(optlist) - baseurl = optlist.get('site') or self.registryValue('url', msg.args[0]) - text = self._wiki(irc, msg, search, baseurl, - use_mw_parsing=not optlist.get('no-mw-parsing'), - ) + lang = optlist.get('lang') or \ + self.registryValue('wikipedia.lang', channel=msg.channel, network=irc.network) - irc.reply(text) + baseurl = f'https://{lang}.wikipedia.org/w/api.php' + self._wiki(irc, baseurl, title) - @internationalizeDocstring - @wrap([additional('somethingWithoutSpaces')]) - def random(self, irc, msg, args, site): - """[] + @wrap(['somethingWithoutSpaces', 'text']) + def fandom(self, irc, msg, args, wiki_subdomain, title): + """ - Returns the first paragraph of a random wiki article. Optionally, the 'site' - argument can be given to override the default (usually Wikipedia).""" - baseurl = site or self.registryValue('url', msg.args[0]) - text = self._wiki(irc, msg, 'Special:Random', baseurl) + Returns the first paragraph of a Fandom article. + """ + baseurl = f'https://{wiki_subdomain}.fandom.com/api.php' + self._wiki(irc, baseurl, title) - irc.reply(text) Class = Wikifetch - - -# vim:set shiftwidth=4 softtabstop=4 expandtab textwidth=79: diff --git a/Wikifetch/setup.py b/Wikifetch/setup.py index a27aff6..7c64e6f 100644 --- a/Wikifetch/setup.py +++ b/Wikifetch/setup.py @@ -3,4 +3,8 @@ from supybot.setup import plugin_setup plugin_setup( 'Wikifetch', + install_requires=[ + 'bs4', + 'mwparserfromhell', + ], ) diff --git a/Wikifetch/test.py b/Wikifetch/test.py index c245dbf..6e0c466 100644 --- a/Wikifetch/test.py +++ b/Wikifetch/test.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- ### -# Copyright (c) 2010, quantumlemur -# Copyright (c) 2011, Valentin Lorentz -# Copyright (c) 2015,2017 James Lu <james@overdrivenetworks.com> +# Copyright (c) 2023 James Lu # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,79 +35,27 @@ if network: plugins = ('Wikifetch',) def testWikipedia(self): - self.assertRegexp('wiki Monty Python', - r'\x02Monty Python\x02.*?\.') - self.assertRegexp('wiki roegdfjpoepo', - 'Not found, or page malformed.*') + self.assertRegexp('wiki Vancouver', + r'^Vancouver.*Canada') + self.assertRegexp('wiki Python (programming language)', + 'Python') + # Follow redirects + self.assertRegexp('wiki CYVR', + 'Vancouver International Airport') - def testStripInlineCitations(self): - self.assertNotRegexp('wiki UNICEF', '\[\d+\]') + # Display MW errors + self.assertRegexp('wiki NotFoundTest', + "missingtitle - The page you specified doesn't exist") - def testIgnoreCoordinates(self): - # Articles for countries, cities, landmarks, etc. have GPS coordinates added to the top right. - # These should be ignored because we want to focus on the actual article text. 
- self.assertNotRegexp('wiki Canada', 'Coordinates\:') - self.assertNotRegexp('wiki Eiffel Tower', 'Coordinates\:') - self.assertNotRegexp('wiki Poland', 'Coordinates\:') + def testWikipediaLang(self): + self.assertRegexp('wiki --lang fr Paris', 'Paris.*capitale') - def testDisambig(self): - self.assertRegexp('wiki Python', 'is a disambiguation page.*' - 'Possible results include:.*?Pythonidae.*?;.*?;') - self.assertRegexp('wiki Fire (disambiguation)', '.*Possible results include:.*') - - def testDisambigStripSpaces(self): - self.assertNotRegexp('wiki Na', '\n') - - def testArticlesWithSymbolsInName(self): - self.assertNotError('wiki /') - self.assertNotError('wiki *') - self.assertNotError('wiki GNU/Linux') - self.assertNotError('wiki --site en.wikipedia.org /') - - def testFollowRedirects(self): - self.assertRegexp('wiki YVR', 'Vancouver International Airport') - - def testWikiBold(self): - self.assertRegexp('wiki Apple', '\x02') - # Complex combination of the <a> tag inside a <b> tag; we should always use - # empty bold content instead of the text "None". - self.assertNotRegexp('wiki Fallstreak hole', 'None') - - def testWikiRandom(self): - self.assertNotError('random') - - def testSiteCombinations(self): - self.assertNotError('wiki --site en.wikipedia.org Bread') - self.assertNotError('wiki --site https://en.wikipedia.org Bread') - - def testNonEnglishWikipedia(self): - self.assertNotError('wiki --site fr.wikipedia.org Paris') - self.assertNotError('wiki --site de.wikipedia.org Berlin') - self.assertNotError('wiki --site zh.wikipedia.org 中文') - self.assertNotError('wiki --site ar.wikipedia.org 2017') - - class Fandom(PluginTestCase): - plugins = ('Wikifetch',) + with conf.supybot.plugins.Wikifetch.Wikipedia.lang.context('zh'): + self.assertRegexp('wiki 地球', '地球.*太阳系') def testFandom(self): - self.assertNotError('wiki --site help.fandom.com Formatting') + self.assertRegexp('fandom minecraft Ender Dragon', + r'[Ee]nder [Dd]ragon.*boss') - class ArchLinuxWiki(PluginTestCase): - plugins = ('Wikifetch',) + self.assertRegexp('fandom terraria Ocean', r'Ocean.*biome') - def testArchWiki(self): - self.assertNotError('wiki --site wiki.archlinux.org Bash') - - class GentooWiki(PluginTestCase): - plugins = ('Wikifetch',) - - def testGentooWiki(self): - self.assertNotError('wiki --site wiki.gentoo.org OpenRC') - - class WikimediaSites(PluginTestCase): - plugins = ('Wikifetch',) - - def testMediaWiki(self): - self.assertNotError('wiki --site mediawiki.org Sites using MediaWiki') - -# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
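For trying out the new API flow outside of a running bot, here is a minimal standalone sketch of what _mediawiki_fetch does: query action=parse for the page's wikitext plus headhtml, strip the markup with mwparserfromhell, and pull a permalink out of the returned <head>. The fetch_first_paragraph helper name, the hard-coded English Wikipedia endpoint, and the use of urllib and html.parser (standing in for the bot's utils.web and the lxml parser the patch requests) are illustrative choices for this sketch only, not part of the patch.

#!/usr/bin/env python3
# Sketch of the Wikifetch fetch flow. Assumptions: urllib + html.parser stand in
# for the bot's utils.web and lxml; fetch_first_paragraph is an illustrative name.
import json
import urllib.parse
import urllib.request

import mwparserfromhell
from bs4 import BeautifulSoup

API_URL = 'https://en.wikipedia.org/w/api.php'
HEADERS = {'User-agent': 'Mozilla/5.0 (compatible; Wikifetch sketch)'}

def fetch_first_paragraph(title):
    # Same query the plugin builds: wikitext for the article body,
    # headhtml for the permalink, with redirects followed.
    params = urllib.parse.urlencode({
        'action': 'parse',
        'page': title,
        'prop': 'wikitext|headhtml',
        'formatversion': 2,
        'format': 'json',
        'redirects': True,
    })
    req = urllib.request.Request(f'{API_URL}?{params}', headers=HEADERS)
    with urllib.request.urlopen(req) as fd:
        data = json.load(fd)

    if 'error' in data:
        raise RuntimeError('MediaWiki API error: %(code)s - %(info)s' % data['error'])

    # Strip wiki markup, then keep the first line that looks like real prose,
    # skipping stray image/thumbnail fragments and very short lines.
    wikicode = mwparserfromhell.parse(data['parse']['wikitext'])
    for line in wikicode.strip_code().splitlines():
        if len(line) >= 10 and '|thumb|' not in line:
            text = line.strip()
            break
    else:
        raise RuntimeError('no text paragraph found for %r' % data['parse']['title'])

    # Wikipedia exposes <link rel="canonical">; Fandom uses <meta property="og:url">.
    soup = BeautifulSoup(data['parse']['headhtml'], features='html.parser')
    link = soup.find('link', rel='canonical') or soup.find('meta', property='og:url')
    url = ''
    if link is not None:
        url = link.get('href') or link.get('content') or ''
    return text, url

if __name__ == '__main__':
    print(' - '.join(fetch_first_paragraph('Vancouver')))

Running this should print roughly the same line the bot replies with for "wiki Vancouver".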