#!/usr/bin/env python3 import argparse import re import sys import mwparserfromhell def fmt_node(w, summary=False): s = '' for node in w.nodes: if isinstance(node, mwparserfromhell.nodes.text.Text): text = str(node) elif isinstance(node, mwparserfromhell.nodes.tag.Tag): text = fmt_node(node.contents, summary=summary) elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink): text = node.text or node.title if ':' in node.title: continue elif isinstance(node, mwparserfromhell.nodes.external_link.ExternalLink): text = node.title or node.url else: continue if s or text.strip(): s += str(text) if summary: lines = s.lstrip().split('\n\n') if len(lines) > 1: s = lines[0] break # replace \n with commas, it's often used inside bulleted lists return s.strip().replace('\n', ', ') _RE_EMPTY_PARENTHESES = re.compile(r' ?\(\s+\)') def _cleanup(text): """Attempt to clean up text a bit further.""" text = re.sub(_RE_EMPTY_PARENTHESES, '', text) return text def fmt(text, clean=True, **kwargs): w = mwparserfromhell.parse(text) output = fmt_node(w, **kwargs) if clean: output = _cleanup(output) return output def main(): parser = argparse.ArgumentParser( description="Generate plain text summaries from Wikitext input") parser.add_argument('--no-summary', '-ns', action='store_true', help='Return the whole page instead of just the first paragraph') args = parser.parse_args() result = fmt(sys.stdin.read(), summary=not args.no_summary) print(result) if __name__ == '__main__': main()