#!/usr/bin/env python ### # Copyright (c) 2002, Jeremiah Fincher # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions, and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions, and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of the author of this software nor the name of # contributors to this software may be used to endorse or promote products # derived from this software without specific prior written consent. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. ### """ Simple utility functions. """ from __future__ import generators from fix import * import sgmllib import htmlentitydefs class HtmlToText(sgmllib.SGMLParser): """Taken from some eff-bot code on c.l.p.""" entitydefs = htmlentitydefs.entitydefs def __init__(self, tagReplace=' '): self.data = [] self.tagReplace = tagReplace sgmllib.SGMLParser.__init__(self) def unknown_starttag(self, tag, attrib): self.data.append(self.tagReplace) def unknown_endtag(self, tag): self.data.append(self.tagReplace) def handle_data(self, data): self.data.append(data) def getText(self): text = ''.join(self.data).strip() return ' '.join(text.split()) # normalize whitespace def htmlToText(s, tagReplace=' '): x = HtmlToText(tagReplace) x.feed(s) return x.getText() def eachSubstring(s): for i in range(1, len(s)+1): yield s[:i] def abbrev(strings): d = {} for s in strings: for abbreviation in eachSubstring(s): if abbreviation not in d: d[abbreviation] = s else: d[abbreviation] = None removals = [] for key in d: if d[key] is None: removals.append(key) for key in removals: del d[key] return d # vim:set shiftwidth=4 tabstop=8 expandtab textwidth=78: