mirror of
https://github.com/jlu5/SupyPlugins.git
synced 2025-04-27 05:21:10 -05:00
Wikifetch: add a --no-mw-parsing option in an attempt to support non-MediaWiki sites
This commit is contained in:
parent
194ac4d7be
commit
819fcc6c09
@ -65,20 +65,20 @@ class Wikifetch(callbacks.Plugin):
|
|||||||
'wikimedia.org': '/wiki',
|
'wikimedia.org': '/wiki',
|
||||||
}
|
}
|
||||||
|
|
||||||
def _get_article_tree(self, baseurl, search):
|
def _get_article_tree(self, baseurl, query, use_mw_parsing=True):
|
||||||
"""
|
"""
|
||||||
Returns the article tree given the base URL and search query. baseurl can be None,
|
Returns a wiki article tree given the base URL and search query. baseurl can be None,
|
||||||
in which case, searching is skipped and the search query will be treated as a raw address.
|
in which case, searching is skipped and the search query will be treated as a raw address.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if baseurl is None:
|
if baseurl is None:
|
||||||
addr = search
|
addr = query
|
||||||
else:
|
else:
|
||||||
# Different instances of MediaWiki use different URLs... This tries
|
# Different instances of MediaWiki use different URLs... This tries
|
||||||
# to make the parser work for most sites, but still use reasonable defaults
|
# to make the parser work for most sites, but still use reasonable defaults
|
||||||
# such as filling in http:// and appending /wiki to links...
|
# such as filling in http:// and appending /wiki to links...
|
||||||
# Special cases: Wikia, Wikipedia, Wikimedia (i.e. Wikimedia Commons), Arch Linux Wiki
|
# Special cases: Wikia, Wikipedia, Wikimedia (i.e. Wikimedia Commons), Arch Linux Wiki
|
||||||
if '/' not in search:
|
if '/' not in query:
|
||||||
baseurl = baseurl.lower()
|
baseurl = baseurl.lower()
|
||||||
for match, suffix in self.SPECIAL_URLS.items():
|
for match, suffix in self.SPECIAL_URLS.items():
|
||||||
if match in baseurl:
|
if match in baseurl:
|
||||||
@ -89,9 +89,12 @@ class Wikifetch(callbacks.Plugin):
|
|||||||
if not baseurl.startswith(('http://', 'https://')):
|
if not baseurl.startswith(('http://', 'https://')):
|
||||||
baseurl = 'http://' + baseurl
|
baseurl = 'http://' + baseurl
|
||||||
|
|
||||||
# first, we get the page
|
if use_mw_parsing:
|
||||||
addr = '%s/Special:Search?search=%s' % \
|
# first, we get the page
|
||||||
(baseurl, quote_plus(search))
|
addr = '%s/Special:Search?search=%s' % \
|
||||||
|
(baseurl, quote_plus(query))
|
||||||
|
else:
|
||||||
|
addr = '%s/%s' % (baseurl, query)
|
||||||
|
|
||||||
self.log.debug('Wikifetch: using URL %s', addr)
|
self.log.debug('Wikifetch: using URL %s', addr)
|
||||||
|
|
||||||
@ -107,12 +110,12 @@ class Wikifetch(callbacks.Plugin):
|
|||||||
tree = lxml.html.document_fromstring(article)
|
tree = lxml.html.document_fromstring(article)
|
||||||
return (tree, article, addr)
|
return (tree, article, addr)
|
||||||
|
|
||||||
def _wiki(self, irc, msg, search, baseurl):
|
def _wiki(self, irc, msg, search, baseurl, use_mw_parsing=True):
|
||||||
"""Fetches and replies content from a MediaWiki-powered website."""
|
"""Fetches and replies content from a MediaWiki-powered website."""
|
||||||
reply = ''
|
reply = ''
|
||||||
|
|
||||||
# First, fetch and parse the page
|
# First, fetch and parse the page
|
||||||
tree, article, addr = self._get_article_tree(baseurl, search)
|
tree, article, addr = self._get_article_tree(baseurl, search, use_mw_parsing=use_mw_parsing)
|
||||||
|
|
||||||
# check if it gives a "Did you mean..." redirect
|
# check if it gives a "Did you mean..." redirect
|
||||||
didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'
|
didyoumean = tree.xpath('//div[@class="searchdidyoumean"]/a'
|
||||||
@ -200,7 +203,10 @@ class Wikifetch(callbacks.Plugin):
|
|||||||
elif 'ns-talk' in tree.find("body").attrib.get('class', ''):
|
elif 'ns-talk' in tree.find("body").attrib.get('class', ''):
|
||||||
reply += format(_('This article appears to be a talk page: %u'), addr)
|
reply += format(_('This article appears to be a talk page: %u'), addr)
|
||||||
else:
|
else:
|
||||||
p = tree.xpath("//div[@id='mw-content-text']/p[1]")
|
if use_mw_parsing:
|
||||||
|
p = tree.xpath("//div[@id='mw-content-text']/p[1]")
|
||||||
|
else: # Disable looking for MediaWiki-specific tags if searching is off
|
||||||
|
p = tree.xpath("//p[1]")
|
||||||
if len(p) == 0 or 'wiki/Special:Search' in addr:
|
if len(p) == 0 or 'wiki/Special:Search' in addr:
|
||||||
if 'wikipedia:wikiproject' in addr.lower():
|
if 'wikipedia:wikiproject' in addr.lower():
|
||||||
reply += format(_('This page appears to be a WikiProject page, '
|
reply += format(_('This page appears to be a WikiProject page, '
|
||||||
@ -236,16 +242,31 @@ class Wikifetch(callbacks.Plugin):
|
|||||||
return reply
|
return reply
|
||||||
|
|
||||||
@internationalizeDocstring
|
@internationalizeDocstring
|
||||||
@wrap([getopts({'site': 'somethingWithoutSpaces'}), 'text'])
|
@wrap([getopts({'site': 'somethingWithoutSpaces',
|
||||||
|
'no-mw-parsing': ''}),
|
||||||
|
'text'])
|
||||||
def wiki(self, irc, msg, args, optlist, search):
|
def wiki(self, irc, msg, args, optlist, search):
|
||||||
"""[--site <site>] <search term>
|
"""[--site <site>] [--no-mw-parsing] <search term>
|
||||||
|
|
||||||
Returns the first paragraph of a wiki article. Optionally, a --site
|
Returns the first paragraph of a wiki article. Optionally, a --site
|
||||||
argument can be given to override the default (usually Wikipedia) -
|
argument can be given to override the default (usually Wikipedia) -
|
||||||
try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'."""
|
try using '--site lyrics.wikia.com' or '--site wiki.archlinux.org'.
|
||||||
|
|
||||||
|
If the --no-mw-parsing option is given, MediaWiki-specific parsing is
|
||||||
|
disabled. This has the following effects:
|
||||||
|
1) No attempt at searching for a relevant Wiki page is made, and
|
||||||
|
an article with the same name as the search term is directly
|
||||||
|
retrieved.
|
||||||
|
2) The plugin will retrieve the first <p> tag found on a page,
|
||||||
|
regardless of where it's found, and print it as text. This may
|
||||||
|
not work on all sites, as some use <p> for navbars and headings
|
||||||
|
as well.
|
||||||
|
"""
|
||||||
optlist = dict(optlist)
|
optlist = dict(optlist)
|
||||||
baseurl = optlist.get('site') or self.registryValue('url', msg.args[0])
|
baseurl = optlist.get('site') or self.registryValue('url', msg.args[0])
|
||||||
text = self._wiki(irc, msg, search, baseurl)
|
text = self._wiki(irc, msg, search, baseurl,
|
||||||
|
use_mw_parsing=not optlist.get('no-mw-parsing'),
|
||||||
|
)
|
||||||
|
|
||||||
irc.reply(text)
|
irc.reply(text)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user