Wikifetch: skip looking at empty leading paragraphs

This commit is contained in:
James Lu 2018-07-19 18:04:56 +00:00
parent 79559d48f1
commit 9221d87c29

View File

@@ -206,8 +206,18 @@ class Wikifetch(callbacks.Plugin):
# Get the first paragraph as text.
paragraphs = []
for p in text_content.xpath("./p"):
self.log.debug('Wikifetch: looking at paragraph %s', p.text_content())
# Skip geographic coordinates, e.g. on articles for countries
if not p.xpath(".//span[@class='geo-dec']"):
if p.xpath(".//span[@class='geo-dec']"):
continue
# 2018-07-19: some articles have an empty p tag with this class and no content (why?)
elif 'mw-empty-elt' in p.attrib.get('class', ''):
continue
# Skip <p> tags with no content, for obvious reasons
elif not p.text_content().strip():
continue
paragraphs.append(p)
if (not paragraphs) or 'wiki/Special:Search' in addr: