mirror of
https://github.com/jlu5/SupyPlugins.git
synced 2025-04-25 20:41:19 -05:00
Wikifetch: add custom plain text translator by walking through Wikitext trees
This commit is contained in:
parent
32d9465b60
commit
a5b453baf3
@ -40,27 +40,23 @@ import supybot.world as world
|
||||
# in here if you're keeping the plugin in CVS or some similar system.
|
||||
__version__ = "2022.02.01+git"
|
||||
|
||||
__author__ = supybot.Author('quantumlemur', 'quantumlemur',
|
||||
'quantumlemur@users.sourceforge.net')
|
||||
__maintainer__ = getattr(supybot.authors, 'jlu',
|
||||
supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com'))
|
||||
__author__ = getattr(supybot.authors, 'jlu',
|
||||
supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com'))
|
||||
|
||||
# This is a dictionary mapping supybot.Author instances to lists of
|
||||
# contributions.
|
||||
__contributors__ = {supybot.authors.progval: ['enhance configurability',
|
||||
'many bug fixes',
|
||||
'internationalization'],
|
||||
__maintainer__: ['formatting updates',
|
||||
'multiple wiki support']}
|
||||
__contributors__ = {}
|
||||
|
||||
__url__ = 'https://github.com/jlu5/SupyPlugins'
|
||||
|
||||
from . import config
|
||||
from . import config, formatter
|
||||
# separate import line so that plugindownloader before 2022-06-23 doesn't run 2to3
|
||||
from . import plugin
|
||||
from imp import reload
|
||||
reload(plugin) # In case we're being reloaded.
|
||||
# Add more reloads here if you add third-party modules and want them to be
|
||||
# reloaded when this plugin is reloaded. Don't forget to import them as well!
|
||||
|
||||
from importlib import reload
|
||||
reload(config)
|
||||
reload(formatter)
|
||||
reload(plugin)
|
||||
|
||||
if world.testing:
|
||||
from . import test
|
||||
|
57
Wikifetch/formatter.py
Normal file
57
Wikifetch/formatter.py
Normal file
@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
|
||||
import mwparserfromhell
|
||||
|
||||
def fmt_node(w, summary=False):
    """Render a mwparserfromhell Wikicode tree *w* as plain text.

    Node handling:
      - Text nodes are kept verbatim.
      - Tags (bold, italics, ...) are rendered recursively from their contents.
      - Wikilinks use their display text (or target); namespaced links such as
        [[File:...]], [[Category:...]], [[en:...]] are dropped entirely.
      - External links use their title if present, else the bare URL.
      - Everything else (templates, comments, headings, ...) is skipped.

    If *summary* is True, stop after the first paragraph (text up to the
    first blank line) of actual content. Leading whitespace-only nodes are
    ignored so the summary starts at the first real text.
    """
    s = ''
    for node in w.nodes:
        if isinstance(node, mwparserfromhell.nodes.text.Text):
            text = str(node)
        elif isinstance(node, mwparserfromhell.nodes.tag.Tag):
            # Self-closing tags (e.g. <br/>, <ref name="x"/>) have no
            # contents; recursing into None would crash, so skip them.
            if node.contents is None:
                continue
            text = fmt_node(node.contents, summary=summary)
        elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink):
            text = node.text or node.title
            # Drop links into other namespaces (File:, Category:,
            # interlanguage links, ...) — they are not article prose.
            if ':' in node.title:
                continue
        elif isinstance(node, mwparserfromhell.nodes.external_link.ExternalLink):
            text = node.title or node.url
        else:
            continue

        # Ignore whitespace-only nodes until we've accumulated real content,
        # so stray leading newlines don't end a summary prematurely.
        if s or text.strip():
            s += str(text)
            if summary:
                lines = s.lstrip().split('\n\n')
                if len(lines) > 1:
                    s = lines[0]
                    break
    return s.strip()
|
||||
|
||||
_RE_EMPTY_PARENTHESES = re.compile(r' ?\(\s+\)')
|
||||
def _cleanup(text):
|
||||
"""Attempt to clean up text a bit further."""
|
||||
text = re.sub(_RE_EMPTY_PARENTHESES, '', text)
|
||||
return text
|
||||
|
||||
def fmt(text, clean=True, **kwargs):
    """Convert Wikitext *text* into plain text.

    Extra keyword arguments (e.g. summary=True) are forwarded to fmt_node.
    When *clean* is true, apply the post-processing pass (_cleanup) to the
    rendered output before returning it.
    """
    tree = mwparserfromhell.parse(text)
    rendered = fmt_node(tree, **kwargs)
    return _cleanup(rendered) if clean else rendered
|
||||
|
||||
def main():
    """CLI entry point: read Wikitext on stdin, print its plain-text form."""
    arg_parser = argparse.ArgumentParser(
        description="Generate plain text summaries from Wikitext input")
    arg_parser.add_argument('--no-summary', '-ns', action='store_true',
        help='Return the whole page instead of just the first paragraph')
    options = arg_parser.parse_args()

    # Summary mode is the default; --no-summary disables it.
    print(fmt(sys.stdin.read(), summary=not options.no_summary))

if __name__ == '__main__':
    main()
|
@ -44,8 +44,9 @@ except:
|
||||
_ = lambda x:x
|
||||
internationalizeDocstring = lambda x:x
|
||||
|
||||
from . import formatter
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import mwparserfromhell
|
||||
|
||||
HEADERS = {
|
||||
'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version
|
||||
@ -61,7 +62,10 @@ class Wikifetch(callbacks.Plugin):
|
||||
|
||||
self.log.debug('Wikifetch: fetching link %s', url)
|
||||
with utils.web.getUrlFd(url, headers=HEADERS) as fd:
|
||||
api_data = json.load(fd)
|
||||
try:
|
||||
api_data = json.load(fd)
|
||||
except json.JSONDecodeError as e:
|
||||
raise callbacks.Error(f"JSON Decode Error on {url}: {e} - is this API URL correct?") from e
|
||||
|
||||
if isinstance(api_data, dict):
|
||||
if error := api_data.get('error'):
|
||||
@ -86,26 +90,20 @@ class Wikifetch(callbacks.Plugin):
|
||||
page_title = api_data['parse']['title']
|
||||
content = api_data['parse']['wikitext']
|
||||
html_head = api_data['parse']['headhtml']
|
||||
mw = mwparserfromhell.parse(content)
|
||||
for line in mw.strip_code().splitlines():
|
||||
# Ignore stray image references that strip_code leaves behind
|
||||
if re.search(r'\|?thumb\|', line):
|
||||
continue
|
||||
elif len(line) < 10:
|
||||
continue
|
||||
text = utils.str.normalizeWhitespace(line)
|
||||
break
|
||||
else:
|
||||
raise callbacks.Error(f"No text paragraph found for page {page_title!r}")
|
||||
text = formatter.fmt(content, summary=True)
|
||||
|
||||
soup = BeautifulSoup(html_head, features="lxml")
|
||||
url = ''
|
||||
if canonical_link := soup.find('link', rel='canonical'):
|
||||
# Wikipedia
|
||||
url = canonical_link.attrs['href']
|
||||
elif og_url := soup.find('meta', property='og:url'):
|
||||
# Fandom
|
||||
url = og_url.attrs['content']
|
||||
else:
|
||||
# Use generic MediaWiki link as fallback (this doesn't look as nice)
|
||||
url = baseurl.replace('api.php', 'index.php?' + urllib.parse.urlencode({
|
||||
'title': page_title
|
||||
}))
|
||||
|
||||
return (text, url)
|
||||
|
||||
|
@ -27,9 +27,120 @@
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
###
|
||||
import unittest
|
||||
|
||||
from . import formatter
|
||||
|
||||
from supybot.test import *
|
||||
|
||||
class WikifetchFormatterTest(unittest.TestCase):
    """Unit tests for the formatter module's Wikitext -> plain text rendering."""

    def assertFormatEqual(self, wikitext, expected, **kwargs):
        """Assert that formatter.fmt(wikitext, **kwargs) equals *expected*."""
        output = formatter.fmt(wikitext, **kwargs)
        self.assertEqual(output, expected)

    def test_basic(self):
        """Plain text passes through unchanged; empty input yields empty output."""
        self.assertFormatEqual('', '')
        self.assertFormatEqual(
            'The quick brown fox jumps over the lazy dog',
            'The quick brown fox jumps over the lazy dog')

    def test_format_tags(self):
        """Bold/italic markup is stripped, leaving the inner text."""
        self.assertFormatEqual(
            "'''Lorem ipsum''' dolor sit amet, consectetur adipiscing elit",
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit")
        self.assertFormatEqual(
            "Test '''bold''' and ''italics'' and '''''both'''''.",
            "Test bold and italics and both.")

    def test_format_wikilinks(self):
        """Wikilinks render as their display text (or target)."""
        self.assertFormatEqual(
            "Abc [[def ghi]]", "Abc def ghi")
        self.assertFormatEqual(
            "Abc [[def|custom title]] xyz", "Abc custom title xyz")
        # namespaced links get dropped
        self.assertFormatEqual(
            "hello world [[File:test.jpg]]", "hello world")
        self.assertFormatEqual(
            "[[Special:RecentChanges]] [[en:Test]]", "")

    def test_format_images(self):
        """Image links are dropped; surrounding paragraphs survive."""
        self.assertFormatEqual(
            "[[File:Foo.png|foo]]\nsome text",
            "some text", summary=True)
        self.assertFormatEqual(
            "[[File:Foo.png|foo]]\n\nsome text again",
            "some text again")

        # Adapted from https://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax#Examples
        self.assertFormatEqual("""text text text
[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]]
aa bb cc dd
[[File:tst.png|100px|alt=Tiny globe|This is a globe.]]
eee fff""", "text text text\n\naa bb cc dd\n\neee fff")
        self.assertFormatEqual("""[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]]
aa bb cc dd
[[File:tst.png|100px|alt=Tiny globe|This is a globe.]]
eee fff""", "aa bb cc dd", summary=True)

    def test_format_external_links(self):
        """External links use their title when present, else the bare URL."""
        self.assertFormatEqual(
            "first [http://example.com] last", "first http://example.com last")
        self.assertFormatEqual(
            "first [http://example.com second] last", "first second last")

    def test_format_templates(self):
        # Templates are ignored
        self.assertFormatEqual(
            "{{tmpl|arg=12345}}", "")
        self.assertFormatEqual(
            "{{tmpl2|foo=12345|bar=abcdefg}} test", "test")
        self.assertFormatEqual(
            "{{outer|{{inner test}}}}", "")
        # mwparserfromhell usage example
        self.assertFormatEqual(
            "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}",
            "Foo is a bar.")

    # multiline
    def test_multiline(self):
        """summary=True truncates to the first paragraph (first blank line)."""
        self.assertFormatEqual(
            "Hello world.\n\nThis is the second line.",
            "Hello world.\n\nThis is the second line.")
        self.assertFormatEqual(
            "Hello world.\n\nThis is the second line.",
            "Hello world.", summary=True)
        self.assertFormatEqual(
            "This sentence is on one\nline.\n\n2nd line",
            "This sentence is on one\nline.", summary=True)

        self.assertFormatEqual(
            "\n\n\n Leading spaces are dropped.\n\nThis is the second line.",
            "Leading spaces are dropped.\n\nThis is the second line.")
        self.assertFormatEqual(
            "\n\n\n Leading spaces are dropped.\n\nThis is the second line.",
            "Leading spaces are dropped.", summary=True)

    def test_multiline_drop_empty_lines(self):
        # drop lines that are empty after filtering
        # e.g. Arch Linux Wiki pages with cross-language links
        self.assertFormatEqual(
            "[[Category:abcd]]\n[[de:Test]]\n[[en:Test]]\n[[zh:Test]]\n{{Related articles start}}\n"
            "Now the actual content starts\n1 2 3 4 5 6",
            "Now the actual content starts\n1 2 3 4 5 6", summary=True)
        self.assertFormatEqual(
            "[[Category:abcd]]\n\n {{Related articles start}} \n\n[[Help:abcdef]]\n\n"
            "Now the actual content starts\n\n1 2 3 4 5 6",
            "Now the actual content starts", summary=True)

    def test_cleanup(self):
        # parentheses whose contents were entirely filtered away (dropped
        # templates) should be removed by the _cleanup pass
        empty_parens_after_filtering = """'''Vancouver''' ({{IPAc-en|audio=EN-Vancouver.ogg|v|æ|n|ˈ|k|uː|v|ər}}
{{respell|van|KOO|vər}}) is a major city in [[western Canada]],"""
        self.assertFormatEqual(
            empty_parens_after_filtering,
            "Vancouver is a major city in western Canada,", summary=True)
|
||||
|
||||
if network:
|
||||
class Wikipedia(PluginTestCase):
|
||||
plugins = ('Wikifetch',)
|
||||
|
Loading…
x
Reference in New Issue
Block a user