mirror of
https://github.com/jlu5/SupyPlugins.git
synced 2025-04-26 13:01:07 -05:00
Wikifetch: add custom plain text translator by walking through Wikitext trees
This commit is contained in:
parent
32d9465b60
commit
a5b453baf3
@ -40,27 +40,23 @@ import supybot.world as world
|
|||||||
# in here if you're keeping the plugin in CVS or some similar system.
|
# in here if you're keeping the plugin in CVS or some similar system.
|
||||||
__version__ = "2022.02.01+git"
|
__version__ = "2022.02.01+git"
|
||||||
|
|
||||||
__author__ = supybot.Author('quantumlemur', 'quantumlemur',
|
__author__ = getattr(supybot.authors, 'jlu',
|
||||||
'quantumlemur@users.sourceforge.net')
|
|
||||||
__maintainer__ = getattr(supybot.authors, 'jlu',
|
|
||||||
supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com'))
|
supybot.Author('James Lu', 'jlu5', 'james@overdrivenetworks.com'))
|
||||||
|
|
||||||
# This is a dictionary mapping supybot.Author instances to lists of
|
# This is a dictionary mapping supybot.Author instances to lists of
|
||||||
# contributions.
|
# contributions.
|
||||||
__contributors__ = {supybot.authors.progval: ['enhance configurability',
|
__contributors__ = {}
|
||||||
'many bug fixes',
|
|
||||||
'internationalization'],
|
|
||||||
__maintainer__: ['formatting updates',
|
|
||||||
'multiple wiki support']}
|
|
||||||
|
|
||||||
__url__ = 'https://github.com/jlu5/SupyPlugins'
|
__url__ = 'https://github.com/jlu5/SupyPlugins'
|
||||||
|
|
||||||
from . import config
|
from . import config, formatter
|
||||||
|
# separate import line so that plugindownloader before 2022-06-23 doesn't run 2to3
|
||||||
from . import plugin
|
from . import plugin
|
||||||
from imp import reload
|
|
||||||
reload(plugin) # In case we're being reloaded.
|
from importlib import reload
|
||||||
# Add more reloads here if you add third-party modules and want them to be
|
reload(config)
|
||||||
# reloaded when this plugin is reloaded. Don't forget to import them as well!
|
reload(formatter)
|
||||||
|
reload(plugin)
|
||||||
|
|
||||||
if world.testing:
|
if world.testing:
|
||||||
from . import test
|
from . import test
|
||||||
|
57
Wikifetch/formatter.py
Normal file
57
Wikifetch/formatter.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import mwparserfromhell
|
||||||
|
|
||||||
|
def fmt_node(w, summary=False):
    """Render a mwparserfromhell Wikicode tree as plain text.

    Walks ``w.nodes`` and keeps only human-readable content:

    - Text nodes are kept verbatim.
    - Tag nodes (bold/italics, etc.) are replaced by their rendered contents.
    - Wikilinks render as their display text (or title); links containing a
      ``:`` in the title (File:, Category:, interlanguage links, ...) are
      dropped entirely.
    - External links render as their title, falling back to the bare URL.
    - Everything else (templates, comments, headings, ...) is skipped.

    :param w: a ``mwparserfromhell.wikicode.Wikicode`` object.
    :param summary: if True, stop after the first paragraph (text up to the
        first blank line).
    :return: the plain-text rendering, stripped of surrounding whitespace.
    """
    s = ''
    for node in w.nodes:
        if isinstance(node, mwparserfromhell.nodes.text.Text):
            text = str(node)
        elif isinstance(node, mwparserfromhell.nodes.tag.Tag):
            # BUGFIX: self-closing / implicit tags (e.g. <br/>) have
            # contents == None; recursing into them raised AttributeError.
            if node.contents is None:
                continue
            text = fmt_node(node.contents, summary=summary)
        elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink):
            # Drop namespaced links (File:, Category:, en:, ...) entirely.
            if ':' in node.title:
                continue
            text = node.text or node.title
        elif isinstance(node, mwparserfromhell.nodes.external_link.ExternalLink):
            text = node.title or node.url
        else:
            # Templates, HTML comments, headings, etc. are ignored.
            continue

        # Don't start the output with whitespace-only nodes (e.g. the
        # newlines left behind by a dropped leading [[File:...]] link).
        if s or text.strip():
            s += str(text)
        if summary:
            # In summary mode, cut at the first paragraph break.
            lines = s.lstrip().split('\n\n')
            if len(lines) > 1:
                s = lines[0]
                break
    return s.strip()
|
||||||
|
|
||||||
|
_RE_EMPTY_PARENTHESES = re.compile(r' ?\(\s+\)')
|
||||||
|
def _cleanup(text):
|
||||||
|
"""Attempt to clean up text a bit further."""
|
||||||
|
text = re.sub(_RE_EMPTY_PARENTHESES, '', text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
def fmt(text, clean=True, **kwargs):
    """Parse raw Wikitext and return its plain-text rendering.

    :param text: Wikitext source as a string.
    :param clean: when True, post-process the result with _cleanup().
    :param kwargs: passed through to fmt_node() (e.g. summary=True).
    :return: plain-text rendering of the input.
    """
    tree = mwparserfromhell.parse(text)
    rendered = fmt_node(tree, **kwargs)
    return _cleanup(rendered) if clean else rendered
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: read Wikitext from stdin, print its plain text."""
    parser = argparse.ArgumentParser(
        description="Generate plain text summaries from Wikitext input")
    parser.add_argument(
        '--no-summary', '-ns', action='store_true',
        help='Return the whole page instead of just the first paragraph')
    options = parser.parse_args()

    # Summary mode is the default; --no-summary switches it off.
    print(fmt(sys.stdin.read(), summary=not options.no_summary))
|
@ -44,8 +44,9 @@ except:
|
|||||||
_ = lambda x:x
|
_ = lambda x:x
|
||||||
internationalizeDocstring = lambda x:x
|
internationalizeDocstring = lambda x:x
|
||||||
|
|
||||||
|
from . import formatter
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import mwparserfromhell
|
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version
|
'User-agent': 'Mozilla/5.0 (compatible; Supybot/Limnoria %s; Wikifetch plugin)' % conf.version
|
||||||
@ -61,7 +62,10 @@ class Wikifetch(callbacks.Plugin):
|
|||||||
|
|
||||||
self.log.debug('Wikifetch: fetching link %s', url)
|
self.log.debug('Wikifetch: fetching link %s', url)
|
||||||
with utils.web.getUrlFd(url, headers=HEADERS) as fd:
|
with utils.web.getUrlFd(url, headers=HEADERS) as fd:
|
||||||
|
try:
|
||||||
api_data = json.load(fd)
|
api_data = json.load(fd)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
raise callbacks.Error(f"JSON Decode Error on {url}: {e} - is this API URL correct?") from e
|
||||||
|
|
||||||
if isinstance(api_data, dict):
|
if isinstance(api_data, dict):
|
||||||
if error := api_data.get('error'):
|
if error := api_data.get('error'):
|
||||||
@ -86,26 +90,20 @@ class Wikifetch(callbacks.Plugin):
|
|||||||
page_title = api_data['parse']['title']
|
page_title = api_data['parse']['title']
|
||||||
content = api_data['parse']['wikitext']
|
content = api_data['parse']['wikitext']
|
||||||
html_head = api_data['parse']['headhtml']
|
html_head = api_data['parse']['headhtml']
|
||||||
mw = mwparserfromhell.parse(content)
|
text = formatter.fmt(content, summary=True)
|
||||||
for line in mw.strip_code().splitlines():
|
|
||||||
# Ignore stray image references that strip_code leaves behind
|
|
||||||
if re.search(r'\|?thumb\|', line):
|
|
||||||
continue
|
|
||||||
elif len(line) < 10:
|
|
||||||
continue
|
|
||||||
text = utils.str.normalizeWhitespace(line)
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
raise callbacks.Error(f"No text paragraph found for page {page_title!r}")
|
|
||||||
|
|
||||||
soup = BeautifulSoup(html_head, features="lxml")
|
soup = BeautifulSoup(html_head, features="lxml")
|
||||||
url = ''
|
|
||||||
if canonical_link := soup.find('link', rel='canonical'):
|
if canonical_link := soup.find('link', rel='canonical'):
|
||||||
# Wikipedia
|
# Wikipedia
|
||||||
url = canonical_link.attrs['href']
|
url = canonical_link.attrs['href']
|
||||||
elif og_url := soup.find('meta', property='og:url'):
|
elif og_url := soup.find('meta', property='og:url'):
|
||||||
# Fandom
|
# Fandom
|
||||||
url = og_url.attrs['content']
|
url = og_url.attrs['content']
|
||||||
|
else:
|
||||||
|
# Use generic MediaWiki link as fallback (this doesn't look as nice)
|
||||||
|
url = baseurl.replace('api.php', 'index.php?' + urllib.parse.urlencode({
|
||||||
|
'title': page_title
|
||||||
|
}))
|
||||||
|
|
||||||
return (text, url)
|
return (text, url)
|
||||||
|
|
||||||
|
@ -27,9 +27,120 @@
|
|||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
###
|
###
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from . import formatter
|
||||||
|
|
||||||
from supybot.test import *
|
from supybot.test import *
|
||||||
|
|
||||||
|
class WikifetchFormatterTest(unittest.TestCase):
    """Unit tests for the Wikitext -> plain text formatter module."""

    def assertFormatEqual(self, wikitext, expected, **kwargs):
        """Assert that formatter.fmt(wikitext, **kwargs) equals expected."""
        actual = formatter.fmt(wikitext, **kwargs)
        self.assertEqual(actual, expected)

    def test_basic(self):
        # Empty input and plain text pass through unchanged.
        self.assertFormatEqual('', '')
        self.assertFormatEqual(
            'The quick brown fox jumps over the lazy dog',
            'The quick brown fox jumps over the lazy dog')

    def test_format_tags(self):
        # Bold/italic markup is stripped; the inner text survives.
        self.assertFormatEqual(
            "'''Lorem ipsum''' dolor sit amet, consectetur adipiscing elit",
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit")
        self.assertFormatEqual(
            "Test '''bold''' and ''italics'' and '''''both'''''.",
            "Test bold and italics and both.")

    def test_format_wikilinks(self):
        self.assertFormatEqual("Abc [[def ghi]]", "Abc def ghi")
        self.assertFormatEqual(
            "Abc [[def|custom title]] xyz", "Abc custom title xyz")
        # namespaced links get dropped
        self.assertFormatEqual(
            "hello world [[File:test.jpg]]", "hello world")
        self.assertFormatEqual("[[Special:RecentChanges]] [[en:Test]]", "")

    def test_format_images(self):
        self.assertFormatEqual(
            "[[File:Foo.png|foo]]\nsome text",
            "some text", summary=True)
        self.assertFormatEqual(
            "[[File:Foo.png|foo]]\n\nsome text again",
            "some text again")

        # Adapted from https://en.wikipedia.org/wiki/Wikipedia:Extended_image_syntax#Examples
        self.assertFormatEqual("""text text text
[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]]
aa bb cc dd
[[File:tst.png|100px|alt=Tiny globe|This is a globe.]]
eee fff""", "text text text\n\naa bb cc dd\n\neee fff")
        self.assertFormatEqual("""[[File:Westminstpalace.jpg|150px|alt=A large clock tower and other buildings line a great river.|The Palace of Westminster]]
aa bb cc dd
[[File:tst.png|100px|alt=Tiny globe|This is a globe.]]
eee fff""", "aa bb cc dd", summary=True)

    def test_format_external_links(self):
        # Bare bracketed links fall back to the URL; titled ones keep the title.
        self.assertFormatEqual(
            "first [http://example.com] last", "first http://example.com last")
        self.assertFormatEqual(
            "first [http://example.com second] last", "first second last")

    def test_format_templates(self):
        # Templates are ignored
        self.assertFormatEqual("{{tmpl|arg=12345}}", "")
        self.assertFormatEqual(
            "{{tmpl2|foo=12345|bar=abcdefg}} test", "test")
        self.assertFormatEqual("{{outer|{{inner test}}}}", "")
        # mwparserfromhell usage example
        self.assertFormatEqual(
            "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}",
            "Foo is a bar.")

    # multiline
    def test_multiline(self):
        self.assertFormatEqual(
            "Hello world.\n\nThis is the second line.",
            "Hello world.\n\nThis is the second line.")
        self.assertFormatEqual(
            "Hello world.\n\nThis is the second line.",
            "Hello world.", summary=True)
        self.assertFormatEqual(
            "This sentence is on one\nline.\n\n2nd line",
            "This sentence is on one\nline.", summary=True)

        self.assertFormatEqual(
            "\n\n\n Leading spaces are dropped.\n\nThis is the second line.",
            "Leading spaces are dropped.\n\nThis is the second line.")
        self.assertFormatEqual(
            "\n\n\n Leading spaces are dropped.\n\nThis is the second line.",
            "Leading spaces are dropped.", summary=True)

    def test_multiline_drop_empty_lines(self):
        # drop lines that are empty after filtering
        # e.g. Arch Linux Wiki pages with cross-language links
        self.assertFormatEqual(
            "[[Category:abcd]]\n[[de:Test]]\n[[en:Test]]\n[[zh:Test]]\n{{Related articles start}}\n"
            "Now the actual content starts\n1 2 3 4 5 6",
            "Now the actual content starts\n1 2 3 4 5 6", summary=True)
        self.assertFormatEqual(
            "[[Category:abcd]]\n\n {{Related articles start}} \n\n[[Help:abcdef]]\n\n"
            "Now the actual content starts\n\n1 2 3 4 5 6",
            "Now the actual content starts", summary=True)

    def test_cleanup(self):
        # Filtered-out pronunciation templates leave behind empty
        # parentheses, which _cleanup() should scrub from the lead sentence.
        empty_parens_after_filtering = """'''Vancouver''' ({{IPAc-en|audio=EN-Vancouver.ogg|v|æ|n|ˈ|k|uː|v|ər}}
{{respell|van|KOO|vər}}) is a major city in [[western Canada]],"""
        self.assertFormatEqual(
            empty_parens_after_filtering,
            "Vancouver is a major city in western Canada,", summary=True)
|
||||||
|
|
||||||
if network:
|
if network:
|
||||||
class Wikipedia(PluginTestCase):
|
class Wikipedia(PluginTestCase):
|
||||||
plugins = ('Wikifetch',)
|
plugins = ('Wikifetch',)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user