From 3ecd8a6db70434a8bad3f47c6ab61d33b1b39b29 Mon Sep 17 00:00:00 2001 From: oddluck <39967334+oddluck@users.noreply.github.com> Date: Sat, 7 Mar 2020 02:05:08 +0000 Subject: [PATCH] Jeopardy: better text reformatting --- Jeopardy/plugin.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Jeopardy/plugin.py b/Jeopardy/plugin.py index 739bfdc..60e15ad 100644 --- a/Jeopardy/plugin.py +++ b/Jeopardy/plugin.py @@ -276,14 +276,15 @@ class Jeopardy(callbacks.Plugin): def normalize(self, q): - q = re.sub('<[^<]+?>', '', fix_text(q, normalization='NFKC')).replace(r"\'", "'").replace(r'\"', '"') + q = BeautifulSoup(q) + q = fix_text(q.text).replace(r"\'", "'").replace(r'\"', '"') q = re.sub('([,;:.!?)])([a-zA-Z]|\()(?![.\'])', '\g<1> \g<2>', q) - q = unidecode(q) q = " ".join(q.split()) return q def clean(self, text): + text = unidecode(text) if len(text) > 2: text = re.sub('[^a-zA-Z0-9 ]+', '', text) text = re.sub('^a |^an |^the |^or ', '', text).replace(' ', '')