From d147207ad167cb50fec1d464fb4b8ffdc6776da9 Mon Sep 17 00:00:00 2001 From: James Lu Date: Sun, 12 Nov 2017 01:36:01 -0800 Subject: [PATCH] Wikifetch: ignore GPS coordinates from articles for countries, etc. --- Wikifetch/plugin.py | 11 ++++++++--- Wikifetch/test.py | 7 +++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Wikifetch/plugin.py b/Wikifetch/plugin.py index 6f7d9a2..2a59946 100644 --- a/Wikifetch/plugin.py +++ b/Wikifetch/plugin.py @@ -204,15 +204,20 @@ class Wikifetch(callbacks.Plugin): reply += format(_('This article appears to be a talk page: %u'), addr) else: # Get the first paragraph as text. - p = text_content.xpath("./p[1]") - if len(p) == 0 or 'wiki/Special:Search' in addr: + paragraphs = [] + for p in text_content.xpath("./p"): + # Skip geographic coordinates, e.g. on articles for countries + if not p.xpath(".//span[@class='geo-dec']"): + paragraphs.append(p) + + if (not paragraphs) or 'wiki/Special:Search' in addr: if 'wikipedia:wikiproject' in addr.lower(): reply += format(_('This page appears to be a WikiProject page, ' 'but it is too complex for us to parse: %u'), addr) else: irc.error(_('Not found, or page malformed.'), Raise=True) else: - p = p[0] + p = paragraphs[0] # Replace tags with IRC-style bold, this has to be # done indirectly because unescaped '\x02' is invalid in XML for b_tag in p.xpath('//b'): diff --git a/Wikifetch/test.py b/Wikifetch/test.py index 76d9c38..d3a272f 100644 --- a/Wikifetch/test.py +++ b/Wikifetch/test.py @@ -45,6 +45,13 @@ if network: def testStripInlineCitations(self): self.assertNotRegexp('wiki UNICEF', '\[\d+\]') + def testIgnoreCoordinates(self): + # Articles for countries, cities, landmarks, etc. have GPS coordinates added to the top right. + # These should be ignored because we want to focus on the actual article text. + self.assertNotRegexp('wiki Canada', 'Coordinates\:') + self.assertNotRegexp('wiki Eiffel Tower', 'Coordinates\:') + self.assertNotRegexp('wiki Poland', 'Coordinates\:') + def testDisambig(self): self.assertRegexp('wiki Python', 'is a disambiguation page.*' 'Possible results include:.*?,.*?,')