diff --git a/Wikifetch/__init__.py b/Wikifetch/__init__.py index 7c34fcc..66d94a2 100644 --- a/Wikifetch/__init__.py +++ b/Wikifetch/__init__.py @@ -40,27 +40,23 @@ import supybot.world as world # in here if you're keeping the plugin in CVS or some similar system. __version__ = "2022.02.01+git" -__author__ = supybot.Author('quantumlemur', 'quantumlemur', - 'quantumlemur@users.sourceforge.net') -__maintainer__ = getattr(supybot.authors, 'jlu', - supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com')) +__author__ = getattr(supybot.authors, 'jlu', + supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com')) # This is a dictionary mapping supybot.Author instances to lists of # contributions. -__contributors__ = {supybot.authors.progval: ['enhance configurability', - 'many bug fixes', - 'internationalization'], - __maintainer__: ['formatting updates', - 'multiple wiki support']} +__contributors__ = {} __url__ = 'https://github.com/jlu5/SupyPlugins' -from . import config +from . import config, formatter +# separate import line so that plugindownloader before 2022-06-23 doesn't run 2to3 from . import plugin -from imp import reload -reload(plugin) # In case we're being reloaded. -# Add more reloads here if you add third-party modules and want them to be -# reloaded when this plugin is reloaded. Don't forget to import them as well! + +from importlib import reload +reload(config) +reload(formatter) +reload(plugin) if world.testing: from . 
import test diff --git a/Wikifetch/formatter.py b/Wikifetch/formatter.py new file mode 100644 index 0000000..374efe3 --- /dev/null +++ b/Wikifetch/formatter.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +import argparse +import re +import sys + +import mwparserfromhell + +def fmt_node(w, summary=False): + s = '' + for node in w.nodes: + if isinstance(node, mwparserfromhell.nodes.text.Text): + text = str(node) + elif isinstance(node, mwparserfromhell.nodes.tag.Tag): + text = fmt_node(node.contents, summary=summary) + elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink): + text = node.text or node.title + if ':' in node.title: + continue + elif isinstance(node, mwparserfromhell.nodes.external_link.ExternalLink): + text = node.title or node.url + else: + continue + + if s or text.strip(): + s += str(text) + if summary: + lines = s.lstrip().split('\n\n') + if len(lines) > 1: + s = lines[0] + break + return s.strip() + +_RE_EMPTY_PARENTHESES = re.compile(r' ?\(\s+\)') +def _cleanup(text): + """Attempt to clean up text a bit further.""" + text = re.sub(_RE_EMPTY_PARENTHESES, '', text) + return text + +def fmt(text, clean=True, **kwargs): + w = mwparserfromhell.parse(text) + output = fmt_node(w, **kwargs) + if clean: + output = _cleanup(output) + return output + +def main(): + parser = argparse.ArgumentParser( + description="Generate plain text summaries from Wikitext input") + parser.add_argument('--no-summary', '-ns', action='store_true', + help='Return the whole page instead of just the first paragraph') + args = parser.parse_args() + + result = fmt(sys.stdin.read(), summary=not args.no_summary) + print(result) + +if __name__ == '__main__': + main() diff --git a/Wikifetch/plugin.py b/Wikifetch/plugin.py index d85c8cf..146539d 100644 --- a/Wikifetch/plugin.py +++ b/Wikifetch/plugin.py @@ -44,8 +44,9 @@ except: _ = lambda x:x internationalizeDocstring = lambda x:x +from . 
import formatter + from bs4 import BeautifulSoup -import mwparserfromhell HEADERS = { 'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version @@ -61,7 +62,10 @@ class Wikifetch(callbacks.Plugin): self.log.debug('Wikifetch: fetching link %s', url) with utils.web.getUrlFd(url, headers=HEADERS) as fd: - api_data = json.load(fd) + try: + api_data = json.load(fd) + except json.JSONDecodeError as e: + raise callbacks.Error(f"JSON Decode Error on {url}: {e} - is this API URL correct?") from e if isinstance(api_data, dict): if error := api_data.get('error'): @@ -86,26 +90,20 @@ class Wikifetch(callbacks.Plugin): page_title = api_data['parse']['title'] content = api_data['parse']['wikitext'] html_head = api_data['parse']['headhtml'] - mw = mwparserfromhell.parse(content) - for line in mw.strip_code().splitlines(): - # Ignore stray image references that strip_code leaves behind - if re.search(r'\|?thumb\|', line): - continue - elif len(line) < 10: - continue - text = utils.str.normalizeWhitespace(line) - break - else: - raise callbacks.Error(f"No text paragraph found for page {page_title!r}") + text = formatter.fmt(content, summary=True) soup = BeautifulSoup(html_head, features="lxml") - url = '' if canonical_link := soup.find('link', rel='canonical'): # Wikipedia url = canonical_link.attrs['href'] elif og_url := soup.find('meta', property='og:url'): # Fandom url = og_url.attrs['content'] + else: + # Use generic MediaWiki link as fallback (this doesn't look as nice) + url = baseurl.replace('api.php', 'index.php?' + urllib.parse.urlencode({ + 'title': page_title + })) return (text, url) diff --git a/Wikifetch/test.py b/Wikifetch/test.py index a260967..5d7ea3e 100644 --- a/Wikifetch/test.py +++ b/Wikifetch/test.py @@ -27,9 +27,120 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. ### +import unittest + +from . 
import formatter from supybot.test import * +class WikifetchFormatterTest(unittest.TestCase): + def assertFormatEqual(self, wikitext, expected, **kwargs): + output = formatter.fmt(wikitext, **kwargs) + self.assertEqual(output, expected) + + def test_basic(self): + self.assertFormatEqual('', '') + self.assertFormatEqual( + 'The quick brown fox jumps over the lazy dog', + 'The quick brown fox jumps over the lazy dog') + + def test_format_tags(self): + self.assertFormatEqual( + "'''Lorem ipsum''' dolor sit amet, consectetur adipiscing elit", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit") + self.assertFormatEqual( + "Test '''bold''' and ''italics'' and '''''both'''''.", + "Test bold and italics and both.") + + def test_format_wikilinks(self): + self.assertFormatEqual( + "Abc [[def ghi]]", "Abc def ghi") + self.assertFormatEqual( + "Abc [[def|custom title]] xyz", "Abc custom title xyz") + # namespaced links get dropped + self.assertFormatEqual( + "hello world [[File:test.jpg]]", "hello world") + self.assertFormatEqual( + "[[Special:RecentChanges]] [[en:Test]]", "") + + def test_format_images(self): + self.assertFormatEqual( + "[[File:Foo.png|foo]]\nsome text", + "some text", summary=True) + self.assertFormatEqual( + "[[File:Foo.png|foo]]\n\nsome text again", + "some text again") + + # Adapted from https://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax#Examples + self.assertFormatEqual("""text text text +[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]] +aa bb cc dd +[[File:tst.png|100px|alt=Tiny globe|This is a globe.]] +eee fff""", "text text text\n\naa bb cc dd\n\neee fff") + self.assertFormatEqual("""[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]] +aa bb cc dd +[[File:tst.png|100px|alt=Tiny globe|This is a globe.]] +eee fff""", "aa bb cc dd", summary=True) + + def test_format_external_links(self): 
+ self.assertFormatEqual( + "first [http://example.com] last", "first http://example.com last") + self.assertFormatEqual( + "first [http://example.com second] last", "first second last") + + def test_format_templates(self): + # Templates are ignored + self.assertFormatEqual( + "{{tmpl|arg=12345}}", "") + self.assertFormatEqual( + "{{tmpl2|foo=12345|bar=abcdefg}} test", "test") + self.assertFormatEqual( + "{{outer|{{inner test}}}}", "") + # mwparserfromhell usage example + self.assertFormatEqual( + "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}", + "Foo is a bar.") + + # multiline + def test_multiline(self): + self.assertFormatEqual( + "Hello world.\n\nThis is the second line.", + "Hello world.\n\nThis is the second line.") + self.assertFormatEqual( + "Hello world.\n\nThis is the second line.", + "Hello world.", summary=True) + self.assertFormatEqual( + "This sentence is on one\nline.\n\n2nd line", + "This sentence is on one\nline.", summary=True) + + self.assertFormatEqual( + "\n\n\n Leading spaces are dropped.\n\nThis is the second line.", + "Leading spaces are dropped.\n\nThis is the second line.") + self.assertFormatEqual( + "\n\n\n Leading spaces are dropped.\n\nThis is the second line.", + "Leading spaces are dropped.", summary=True) + + def test_multiline_drop_empty_lines(self): + # drop lines that are empty after filtering + # e.g. Arch Linux Wiki pages with cross-language links + self.assertFormatEqual( + "[[Category:abcd]]\n[[de:Test]]\n[[en:Test]]\n[[zh:Test]]\n{{Related articles start}}\n" + "Now the actual content starts\n1 2 3 4 5 6", + "Now the actual content starts\n1 2 3 4 5 6", summary=True) + self.assertFormatEqual( + "[[Category:abcd]]\n\n {{Related articles start}} \n\n[[Help:abcdef]]\n\n" + "Now the actual content starts\n\n1 2 3 4 5 6", + "Now the actual content starts", summary=True) + + def test_cleanup(self): + # drop parentheses that are left empty after filtering + # e.g.
a lead sentence whose parenthesized pronunciation templates are stripped + empty_parens_after_filtering = """'''Vancouver''' ({{IPAc-en|audio=EN-Vancouver.ogg|v|æ|n|ˈ|k|uː|v|ər}} + {{respell|van|KOO|vər}}) is a major city in [[western Canada]],""" + self.assertFormatEqual( + empty_parens_after_filtering, + "Vancouver is a major city in western Canada,", summary=True) + if network: class Wikipedia(PluginTestCase): plugins = ('Wikifetch',)