From 9221d87c29bc9c6a91a4cbe1ece9ce23c211e93a Mon Sep 17 00:00:00 2001 From: James Lu Date: Thu, 19 Jul 2018 18:04:56 +0000 Subject: [PATCH] Wikifetch: skip looking at empty leading paragraphs --- Wikifetch/plugin.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Wikifetch/plugin.py b/Wikifetch/plugin.py index 7b25095..fd5a9bd 100644 --- a/Wikifetch/plugin.py +++ b/Wikifetch/plugin.py @@ -206,9 +206,19 @@ class Wikifetch(callbacks.Plugin): # Get the first paragraph as text. paragraphs = [] for p in text_content.xpath("./p"): + self.log.debug('Wikifetch: looking at paragraph %s', p.text_content()) + # Skip geographic coordinates, e.g. on articles for countries - if not p.xpath(".//span[@class='geo-dec']"): - paragraphs.append(p) + if p.xpath(".//span[@class='geo-dec']"): + continue + # 2018-07-19: some articles have an empty p tag with this class and no content (why?) + elif 'mw-empty-elt' in p.attrib.get('class', ''): + continue + # Skip <p> tags with no content, for obvious reasons + elif not p.text_content().strip(): + continue + + paragraphs.append(p) if (not paragraphs) or 'wiki/Special:Search' in addr: if 'wikipedia:wikiproject' in addr.lower():