Wikifetch: ignore GPS coordinates from articles for countries, etc.

This commit is contained in:
James Lu 2017-11-12 01:36:01 -08:00
parent d000140891
commit d147207ad1
2 changed files with 15 additions and 3 deletions

View File

@ -204,15 +204,20 @@ class Wikifetch(callbacks.Plugin):
reply += format(_('This article appears to be a talk page: %u'), addr) reply += format(_('This article appears to be a talk page: %u'), addr)
else: else:
# Get the first paragraph as text. # Get the first paragraph as text.
p = text_content.xpath("./p[1]") paragraphs = []
if len(p) == 0 or 'wiki/Special:Search' in addr: for p in text_content.xpath("./p"):
# Skip geographic coordinates, e.g. on articles for countries
if not p.xpath(".//span[@class='geo-dec']"):
paragraphs.append(p)
if (not paragraphs) or 'wiki/Special:Search' in addr:
if 'wikipedia:wikiproject' in addr.lower(): if 'wikipedia:wikiproject' in addr.lower():
reply += format(_('This page appears to be a WikiProject page, ' reply += format(_('This page appears to be a WikiProject page, '
'but it is too complex for us to parse: %u'), addr) 'but it is too complex for us to parse: %u'), addr)
else: else:
irc.error(_('Not found, or page malformed.'), Raise=True) irc.error(_('Not found, or page malformed.'), Raise=True)
else: else:
p = p[0] p = paragraphs[0]
# Replace <b> tags with IRC-style bold, this has to be # Replace <b> tags with IRC-style bold, this has to be
# done indirectly because unescaped '\x02' is invalid in XML # done indirectly because unescaped '\x02' is invalid in XML
for b_tag in p.xpath('//b'): for b_tag in p.xpath('//b'):

View File

@ -45,6 +45,13 @@ if network:
def testStripInlineCitations(self): def testStripInlineCitations(self):
self.assertNotRegexp('wiki UNICEF', '\[\d+\]') self.assertNotRegexp('wiki UNICEF', '\[\d+\]')
def testIgnoreCoordinates(self):
    # Articles for countries, cities, landmarks, etc. have GPS coordinates
    # added to the top right of the page. They must not leak into the reply,
    # because we want to focus on the actual article text.
    #
    # The pattern is a raw string so that the backslash reaches the regex
    # engine untouched instead of being an invalid string escape ('\:').
    for article in ('Canada', 'Eiffel Tower', 'Poland'):
        self.assertNotRegexp('wiki %s' % article, r'Coordinates\:')
def testDisambig(self): def testDisambig(self):
self.assertRegexp('wiki Python', 'is a disambiguation page.*' self.assertRegexp('wiki Python', 'is a disambiguation page.*'
'Possible results include:.*?,.*?,') 'Possible results include:.*?,.*?,')