Wikifetch: skip looking at empty leading paragraphs

This commit is contained in:
James Lu 2018-07-19 18:04:56 +00:00
parent 79559d48f1
commit 9221d87c29

View File

@@ -206,9 +206,19 @@ class Wikifetch(callbacks.Plugin):
# Get the first paragraph as text. # Get the first paragraph as text.
paragraphs = [] paragraphs = []
for p in text_content.xpath("./p"): for p in text_content.xpath("./p"):
self.log.debug('Wikifetch: looking at paragraph %s', p.text_content())
# Skip geographic coordinates, e.g. on articles for countries # Skip geographic coordinates, e.g. on articles for countries
if not p.xpath(".//span[@class='geo-dec']"): if p.xpath(".//span[@class='geo-dec']"):
paragraphs.append(p) continue
# 2018-07-19: some articles have an empty p tag with this class and no content (why?)
elif 'mw-empty-elt' in p.attrib.get('class', ''):
continue
# Skip <p> tags with no content, for obvious reasons
elif not p.text_content().strip():
continue
paragraphs.append(p)
if (not paragraphs) or 'wiki/Special:Search' in addr: if (not paragraphs) or 'wiki/Special:Search' in addr:
if 'wikipedia:wikiproject' in addr.lower(): if 'wikipedia:wikiproject' in addr.lower():