mirror of
https://github.com/jlu5/SupyPlugins.git
synced 2025-04-26 04:51:08 -05:00
Wikifetch: add a --no-mw-parsing option in an attempt to support non-MediaWiki sites
This commit is contained in:
parent
194ac4d7be
commit
819fcc6c09
@ -65,20 +65,20 @@ class Wikifetch(callbacks.Plugin):
|
||||
'wikimedia.org': '/wiki',
|
||||
}
|
||||
|
||||
def _get_article_tree(self, baseurl, search):
|
||||
def _get_article_tree(self, baseurl, query, use_mw_parsing=True):
|
||||
"""
|
||||
Returns the article tree given the base URL and search query. baseurl can be None,
|
||||
Returns a wiki article tree given the base URL and search query. baseurl can be None,
|
||||
in which case, searching is skipped and the search query will be treated as a raw address.
|
||||
"""
|
||||
|
||||
if baseurl is None:
|
||||
addr = search
|
||||
addr = query
|
||||
else:
|
||||
# Different instances of MediaWiki use different URLs... This tries
|
||||
# to make the parser work for most sites, but still use resonable defaults
|
||||
# such as filling in http:// and appending /wiki to links...
|
||||
# Special cases: Wikia, Wikipedia, Wikimedia (i.e. Wikimedia Commons), Arch Linux Wiki
|
||||
if '/' not in search:
|
||||
if '/' not in query:
|
||||
baseurl = baseurl.lower()
|
||||
for match, suffix in self.SPECIAL_URLS.items():
|
||||
if match in baseurl:
|
||||
@ -89,9 +89,12 @@ class Wikifetch(callbacks.Plugin):
|
||||
if not baseurl.startswith(('http://', 'https://')):
|
||||
baseurl = 'http://' + baseurl
|
||||
|
||||
# first, we get the page
|
||||
addr = '%s/Special:Search?search=%s' % \
|
||||
(baseurl, quote_plus(search))
|
||||
if use_mw_parsing:
|
||||
# first, we get the page
|
||||
addr = '%s/Special:Search?search=%s' % \
|
||||
(baseurl, quote_plus(query))
|
||||
else:
|
||||
addr = '%s/%s' % (baseurl, query)
|
||||
|
||||
self.log.debug('Wikifetch: using URL %s', addr)
|
||||
|
||||
@ -107,12 +110,12 @@ class Wikifetch(callbacks.Plugin):
|
||||
tree = lxml.html.document_fromstring(article)
|
||||
return (tree, article, addr)
|
||||
|
||||
def _wiki(self, irc, msg, search, baseurl):
|
||||
def _wiki(self, irc, msg, search, baseurl, use_mw_parsing=True):
|
||||
"""Fetches and replies content from a MediaWiki-powered website."""
|
||||
reply = ''
|
||||
|
||||
# First, fetch and parse the page
|
||||
tree, article, addr = self._get_article_tree(baseurl, search)
|
||||
tree, article, addr = self._get_article_tree(baseurl, search, use_mw_parsing=use_mw_parsing)
|
||||
|
||||
# check if it gives a "Did you mean..." redirect
|
||||
didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'
|
||||
@ -200,7 +203,10 @@ class Wikifetch(callbacks.Plugin):
|
||||
elif 'ns-talk' in tree.find("body").attrib.get('class', ''):
|
||||
reply += format(_('This article appears to be a talk page: %u'), addr)
|
||||
else:
|
||||
p = tree.xpath("//div[@id='mw-content-text']/p[1]")
|
||||
if use_mw_parsing:
|
||||
p = tree.xpath("//div[@id='mw-content-text']/p[1]")
|
||||
else: # Disable looking for MediaWiki-specific tags if searching is off
|
||||
p = tree.xpath("//p[1]")
|
||||
if len(p) == 0 or 'wiki/Special:Search' in addr:
|
||||
if 'wikipedia:wikiproject' in addr.lower():
|
||||
reply += format(_('This page appears to be a WikiProject page, '
|
||||
@ -236,16 +242,31 @@ class Wikifetch(callbacks.Plugin):
|
||||
return reply
|
||||
|
||||
@internationalizeDocstring
|
||||
@wrap([getopts({'site': 'somethingWithoutSpaces'}), 'text'])
|
||||
@wrap([getopts({'site': 'somethingWithoutSpaces',
|
||||
'no-mw-parsing': ''}),
|
||||
'text'])
|
||||
def wiki(self, irc, msg, args, optlist, search):
|
||||
"""[--site <site>] <search term>
|
||||
"""[--site <site>] [--no-mw-parsing] <search term>
|
||||
|
||||
Returns the first paragraph of a wiki article. Optionally, a --site
|
||||
argument can be given to override the default (usually Wikipedia) -
|
||||
try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'."""
|
||||
try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'.
|
||||
|
||||
If the --no-mw-parsing option is given, MediaWiki-specific parsing is
|
||||
disabled. This has the following effects:
|
||||
1) No attempt at searching for a relevant Wiki page is made, and
|
||||
an article with the same name as the search term is directly
|
||||
retrieved.
|
||||
2) The plugin will retrieve the first <p> tag found on a page,
|
||||
regardless of where it's found, and print it as text. This may
|
||||
not work on all sites, as some use <p> for navbars and headings
|
||||
as well.
|
||||
"""
|
||||
optlist = dict(optlist)
|
||||
baseurl = optlist.get('site') or self.registryValue('url', msg.args[0])
|
||||
text = self._wiki(irc, msg, search, baseurl)
|
||||
text = self._wiki(irc, msg, search, baseurl,
|
||||
use_mw_parsing=not optlist.get('no-mw-parsing'),
|
||||
)
|
||||
|
||||
irc.reply(text)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user