From 08d8f48db5ad623a372eadb1b65b2cdfebb59f77 Mon Sep 17 00:00:00 2001 From: James Lu Date: Fri, 1 Sep 2017 18:04:20 -0700 Subject: [PATCH] Wikifetch: refactor text fetching, fix listing disambig results --- Wikifetch/plugin.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/Wikifetch/plugin.py b/Wikifetch/plugin.py index b95a5e9..707af2b 100644 --- a/Wikifetch/plugin.py +++ b/Wikifetch/plugin.py @@ -180,13 +180,20 @@ class Wikifetch(callbacks.Plugin): # If any of the above post-processing tricks fail, just ignore pass + text_content = tree + if use_mw_parsing: + text_content = tree.xpath("//div[@class='mw-parser-output']") or tree.xpath("//div[@id='mw-content-text']") + if text_content: + text_content = text_content[0] + self.log.debug('Wikifetch: Using %s as text_content', text_content) + # check if it's a disambiguation page disambig = tree.xpath('//table[@id="disambigbox"]') or \ tree.xpath('//table[@id="setindexbox"]') or \ tree.xpath('//div[contains(@class, "disambig")]') # Wikia (2017-01-27) if disambig: reply += format(_('%u is a disambiguation page. '), addr) - disambig = tree.xpath('//div[@id="bodyContent"]/div/ul/li') + disambig = text_content.xpath('./ul/li') disambig_results = [] for item in disambig: @@ -206,12 +213,8 @@ class Wikifetch(callbacks.Plugin): elif 'ns-talk' in tree.find("body").attrib.get('class', ''): reply += format(_('This article appears to be a talk page: %u'), addr) else: - if use_mw_parsing: - # As of 2017-06-03, Wikipedia has put its text content under a new "mw-parser-output" div, while - # other sites (e.g. Wikia) still have it directly under "mw-content-text". - p = tree.xpath("//div[@id='mw-content-text']/p[1]") or tree.xpath("//div[@class='mw-parser-output']/p[1]") - else: # Don't look for MediaWiki-specific tags if MediaWiki parsing is disabled - p = tree.xpath("//p[1]") + # Get the first paragraph as text. + p = text_content.xpath("./p[1]") if len(p) == 0 or 'wiki/Special:Search' in addr: if 'wikipedia:wikiproject' in addr.lower(): reply += format(_('This page appears to be a WikiProject page, '