Wikifetch: ignore GPS coordinates from articles for countries, etc.

This commit is contained in:
James Lu 2017-11-12 01:36:01 -08:00
parent d000140891
commit d147207ad1
2 changed files with 15 additions and 3 deletions

View File

@ -204,15 +204,20 @@ class Wikifetch(callbacks.Plugin):
reply += format(_('This article appears to be a talk page: %u'), addr)
else:
# Get the first paragraph as text.
p = text_content.xpath("./p[1]")
if len(p) == 0 or 'wiki/Special:Search' in addr:
paragraphs = []
for p in text_content.xpath("./p"):
# Skip geographic coordinates, e.g. on articles for countries
if not p.xpath(".//span[@class='geo-dec']"):
paragraphs.append(p)
if (not paragraphs) or 'wiki/Special:Search' in addr:
if 'wikipedia:wikiproject' in addr.lower():
reply += format(_('This page appears to be a WikiProject page, '
'but it is too complex for us to parse: %u'), addr)
else:
irc.error(_('Not found, or page malformed.'), Raise=True)
else:
p = p[0]
p = paragraphs[0]
# Replace <b> tags with IRC-style bold, this has to be
# done indirectly because unescaped '\x02' is invalid in XML
for b_tag in p.xpath('//b'):

View File

@ -45,6 +45,13 @@ if network:
def testStripInlineCitations(self):
    # The reply for an article must not contain bracketed numeric
    # citation markers such as "[1]". The pattern is a raw string so the
    # regex backslashes are not parsed as (invalid) Python string escapes.
    self.assertNotRegexp('wiki UNICEF', r'\[\d+\]')
def testIgnoreCoordinates(self):
    # Articles for countries, cities, landmarks, etc. have GPS coordinates
    # added to the top right. These should be ignored because we want to
    # focus on the actual article text.
    # The pattern is a raw string: '\:' is not a valid Python string escape
    # and triggers a warning on modern interpreters when written without
    # the r-prefix. The regex itself is unchanged.
    self.assertNotRegexp('wiki Canada', r'Coordinates\:')
    self.assertNotRegexp('wiki Eiffel Tower', r'Coordinates\:')
    self.assertNotRegexp('wiki Poland', r'Coordinates\:')
def testDisambig(self):
self.assertRegexp('wiki Python', 'is a disambiguation page.*'
'Possible results include:.*?,.*?,')