mirror of
https://github.com/jlu5/SupyPlugins.git
synced 2025-04-25 20:41:19 -05:00
Wikifetch: add custom plain text translator by walking through Wikitext trees
This commit is contained in:
parent
32d9465b60
commit
a5b453baf3
@ -40,27 +40,23 @@ import supybot.world as world
|
||||
# in here if you're keeping the plugin in CVS or some similar system.
|
||||
__version__ = "2022.02.01+git"
|
||||
|
||||
__author__ = supybot.Author('quantumlemur', 'quantumlemur',
|
||||
'quantumlemur@users.sourceforge.net')
|
||||
__maintainer__ = getattr(supybot.authors, 'jlu',
|
||||
supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com'))
|
||||
__author__ = getattr(supybot.authors, 'jlu',
|
||||
supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com'))
|
||||
|
||||
# This is a dictionary mapping supybot.Author instances to lists of
|
||||
# contributions.
|
||||
__contributors__ = {supybot.authors.progval: ['enhance configurability',
|
||||
'many bug fixes',
|
||||
'internationalization'],
|
||||
__maintainer__: ['formatting updates',
|
||||
'multiple wiki support']}
|
||||
__contributors__ = {}
|
||||
|
||||
__url__ = 'https://github.com/jlu5/SupyPlugins'
|
||||
|
||||
from . import config
|
||||
from . import config, formatter
|
||||
# separate import line so that plugindownloader before 2022-06-23 doesn't run 2to3
|
||||
from . import plugin
|
||||
from imp import reload
|
||||
reload(plugin) # In case we're being reloaded.
|
||||
# Add more reloads here if you add third-party modules and want them to be
|
||||
# reloaded when this plugin is reloaded. Don't forget to import them as well!
|
||||
|
||||
from importlib import reload
|
||||
reload(config)
|
||||
reload(formatter)
|
||||
reload(plugin)
|
||||
|
||||
if world.testing:
|
||||
from . import test
|
||||
|
57
Wikifetch/formatter.py
Normal file
57
Wikifetch/formatter.py
Normal file
@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
|
||||
import mwparserfromhell
|
||||
|
||||
def fmt_node(w, summary=False):
    """Render a mwparserfromhell Wikicode tree *w* as plain text.

    Node handling:
      - Text nodes are kept verbatim.
      - Tags (bold, italics, ...) are rendered recursively from their contents.
      - Wikilinks use their display text (or target); namespaced links such as
        [[File:...]], [[Category:...]], [[en:...]] are dropped entirely.
      - External links use their title if present, else the bare URL.
      - Everything else (templates, comments, headings, ...) is skipped.

    If *summary* is True, stop after the first paragraph (text up to the
    first blank line) of actual content. Leading whitespace-only nodes are
    ignored so the summary starts at the first real text.
    """
    s = ''
    for node in w.nodes:
        if isinstance(node, mwparserfromhell.nodes.text.Text):
            text = str(node)
        elif isinstance(node, mwparserfromhell.nodes.tag.Tag):
            # Self-closing tags (e.g. <br/>, <ref name="x"/>) have no
            # contents; recursing into None would crash, so skip them.
            if node.contents is None:
                continue
            text = fmt_node(node.contents, summary=summary)
        elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink):
            text = node.text or node.title
            # Drop links into other namespaces (File:, Category:,
            # interlanguage links, ...) — they are not article prose.
            if ':' in node.title:
                continue
        elif isinstance(node, mwparserfromhell.nodes.external_link.ExternalLink):
            text = node.title or node.url
        else:
            continue

        # Ignore whitespace-only nodes until we've accumulated real content,
        # so stray leading newlines don't end a summary prematurely.
        if s or text.strip():
            s += str(text)
            if summary:
                lines = s.lstrip().split('\n\n')
                if len(lines) > 1:
                    s = lines[0]
                    break
    return s.strip()
|
||||
|
||||
_RE_EMPTY_PARENTHESES = re.compile(r' ?\(\s+\)')
|
||||
def _cleanup(text):
|
||||
"""Attempt to clean up text a bit further."""
|
||||
text = re.sub(_RE_EMPTY_PARENTHESES, '', text)
|
||||
return text
|
||||
|
||||
def fmt(text, clean=True, **kwargs):
    """Convert Wikitext *text* into plain text.

    Extra keyword arguments (e.g. summary=True) are forwarded to fmt_node.
    When *clean* is true, apply the post-processing pass (_cleanup) to the
    rendered output before returning it.
    """
    tree = mwparserfromhell.parse(text)
    rendered = fmt_node(tree, **kwargs)
    return _cleanup(rendered) if clean else rendered
|
||||
|
||||
def main():
    """CLI entry point: read Wikitext on stdin, print its plain-text form."""
    arg_parser = argparse.ArgumentParser(
        description="Generate plain text summaries from Wikitext input")
    arg_parser.add_argument('--no-summary', '-ns', action='store_true',
        help='Return the whole page instead of just the first paragraph')
    options = arg_parser.parse_args()

    # Summary mode is the default; --no-summary disables it.
    print(fmt(sys.stdin.read(), summary=not options.no_summary))

if __name__ == '__main__':
    main()
|
@ -44,8 +44,9 @@ except:
|
||||
_ = lambda x:x
|
||||
internationalizeDocstring = lambda x:x
|
||||
|
||||
from . import formatter
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import mwparserfromhell
|
||||
|
||||
HEADERS = {
|
||||
'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version
|
||||
@ -61,7 +62,10 @@ class Wikifetch(callbacks.Plugin):
|
||||
|
||||
self.log.debug('Wikifetch: fetching link %s', url)
|
||||
with utils.web.getUrlFd(url, headers=HEADERS) as fd:
|
||||
api_data = json.load(fd)
|
||||
try:
|
||||
api_data = json.load(fd)
|
||||
except json.JSONDecodeError as e:
|
||||
raise callbacks.Error(f"JSON Decode Error on {url}: {e} - is this API URL correct?") from e
|
||||
|
||||
if isinstance(api_data, dict):
|
||||
if error := api_data.get('error'):
|
||||
@ -86,26 +90,20 @@ class Wikifetch(callbacks.Plugin):
|
||||
page_title = api_data['parse']['title']
|
||||
content = api_data['parse']['wikitext']
|
||||
html_head = api_data['parse']['headhtml']
|
||||
mw = mwparserfromhell.parse(content)
|
||||
for line in mw.strip_code().splitlines():
|
||||
# Ignore stray image references that strip_code leaves behind
|
||||
if re.search(r'\|?thumb\|', line):
|
||||
continue
|
||||
elif len(line) < 10:
|
||||
continue
|
||||
text = utils.str.normalizeWhitespace(line)
|
||||
break
|
||||
else:
|
||||
raise callbacks.Error(f"No text paragraph found for page {page_title!r}")
|
||||
text = formatter.fmt(content, summary=True)
|
||||
|
||||
soup = BeautifulSoup(html_head, features="lxml")
|
||||
url = ''
|
||||
if canonical_link := soup.find('link', rel='canonical'):
|
||||
# Wikipedia
|
||||
url = canonical_link.attrs['href']
|
||||
elif og_url := soup.find('meta', property='og:url'):
|
||||
# Fandom
|
||||
url = og_url.attrs['content']
|
||||
else:
|
||||
# Use generic MediaWiki link as fallback (this doesn't look as nice)
|
||||
url = baseurl.replace('api.php', 'index.php?' + urllib.parse.urlencode({
|
||||
'title': page_title
|
||||
}))
|
||||
|
||||
return (text, url)
|
||||
|
||||
|
@ -27,9 +27,120 @@
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
###
|
||||
import unittest
|
||||
|
||||
from . import formatter
|
||||
|
||||
from supybot.test import *
|
||||
|
||||
class WikifetchFormatterTest(unittest.TestCase):
    """Unit tests for the formatter module's Wikitext -> plain text rendering."""

    def assertFormatEqual(self, wikitext, expected, **kwargs):
        """Assert that formatter.fmt(wikitext, **kwargs) equals *expected*."""
        output = formatter.fmt(wikitext, **kwargs)
        self.assertEqual(output, expected)

    def test_basic(self):
        """Plain text passes through unchanged; empty input yields empty output."""
        self.assertFormatEqual('', '')
        self.assertFormatEqual(
            'The quick brown fox jumps over the lazy dog',
            'The quick brown fox jumps over the lazy dog')

    def test_format_tags(self):
        """Bold/italic markup is stripped, leaving the inner text."""
        self.assertFormatEqual(
            "'''Lorem ipsum''' dolor sit amet, consectetur adipiscing elit",
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit")
        self.assertFormatEqual(
            "Test '''bold''' and ''italics'' and '''''both'''''.",
            "Test bold and italics and both.")

    def test_format_wikilinks(self):
        """Wikilinks render as their display text (or target)."""
        self.assertFormatEqual(
            "Abc [[def ghi]]", "Abc def ghi")
        self.assertFormatEqual(
            "Abc [[def|custom title]] xyz", "Abc custom title xyz")
        # namespaced links get dropped
        self.assertFormatEqual(
            "hello world [[File:test.jpg]]", "hello world")
        self.assertFormatEqual(
            "[[Special:RecentChanges]] [[en:Test]]", "")

    def test_format_images(self):
        """Image links are dropped; surrounding paragraphs survive."""
        self.assertFormatEqual(
            "[[File:Foo.png|foo]]\nsome text",
            "some text", summary=True)
        self.assertFormatEqual(
            "[[File:Foo.png|foo]]\n\nsome text again",
            "some text again")

        # Adapted from https://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax#Examples
        self.assertFormatEqual("""text text text
[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]]
aa bb cc dd
[[File:tst.png|100px|alt=Tiny globe|This is a globe.]]
eee fff""", "text text text\n\naa bb cc dd\n\neee fff")
        self.assertFormatEqual("""[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]]
aa bb cc dd
[[File:tst.png|100px|alt=Tiny globe|This is a globe.]]
eee fff""", "aa bb cc dd", summary=True)

    def test_format_external_links(self):
        """External links use their title when present, else the bare URL."""
        self.assertFormatEqual(
            "first [http://example.com] last", "first http://example.com last")
        self.assertFormatEqual(
            "first [http://example.com second] last", "first second last")

    def test_format_templates(self):
        # Templates are ignored
        self.assertFormatEqual(
            "{{tmpl|arg=12345}}", "")
        self.assertFormatEqual(
            "{{tmpl2|foo=12345|bar=abcdefg}} test", "test")
        self.assertFormatEqual(
            "{{outer|{{inner test}}}}", "")
        # mwparserfromhell usage example
        self.assertFormatEqual(
            "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}",
            "Foo is a bar.")

    # multiline
    def test_multiline(self):
        """summary=True truncates to the first paragraph (first blank line)."""
        self.assertFormatEqual(
            "Hello world.\n\nThis is the second line.",
            "Hello world.\n\nThis is the second line.")
        self.assertFormatEqual(
            "Hello world.\n\nThis is the second line.",
            "Hello world.", summary=True)
        self.assertFormatEqual(
            "This sentence is on one\nline.\n\n2nd line",
            "This sentence is on one\nline.", summary=True)

        self.assertFormatEqual(
            "\n\n\n Leading spaces are dropped.\n\nThis is the second line.",
            "Leading spaces are dropped.\n\nThis is the second line.")
        self.assertFormatEqual(
            "\n\n\n Leading spaces are dropped.\n\nThis is the second line.",
            "Leading spaces are dropped.", summary=True)

    def test_multiline_drop_empty_lines(self):
        # drop lines that are empty after filtering
        # e.g. Arch Linux Wiki pages with cross-language links
        self.assertFormatEqual(
            "[[Category:abcd]]\n[[de:Test]]\n[[en:Test]]\n[[zh:Test]]\n{{Related articles start}}\n"
            "Now the actual content starts\n1 2 3 4 5 6",
            "Now the actual content starts\n1 2 3 4 5 6", summary=True)
        self.assertFormatEqual(
            "[[Category:abcd]]\n\n {{Related articles start}} \n\n[[Help:abcdef]]\n\n"
            "Now the actual content starts\n\n1 2 3 4 5 6",
            "Now the actual content starts", summary=True)

    def test_cleanup(self):
        # parentheses whose contents were entirely filtered away (dropped
        # templates) should be removed by the _cleanup pass
        empty_parens_after_filtering = """'''Vancouver''' ({{IPAc-en|audio=EN-Vancouver.ogg|v|æ|n|ˈ|k|uː|v|ər}}
{{respell|van|KOO|vər}}) is a major city in [[western Canada]],"""
        self.assertFormatEqual(
            empty_parens_after_filtering,
            "Vancouver is a major city in western Canada,", summary=True)
|
||||
|
||||
if network:
|
||||
class Wikipedia(PluginTestCase):
|
||||
plugins = ('Wikifetch',)
|
||||
|
Loading…
x
Reference in New Issue
Block a user