From 625043352af4b78902ec5acf701dd10fe6167d58 Mon Sep 17 00:00:00 2001 From: Vasily Zubarev Date: Tue, 7 Jan 2020 17:53:24 +0100 Subject: [PATCH] Strip html from titles --- Review note: parse_title() now falls back to an empty string when the entry has no title, description or summary, so refresh_feed() skips that entry via its "if not entry_title" guard instead of re.sub() raising TypeError on None. scripts/update.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/update.py b/scripts/update.py index 6508f61..b73f5ce 100644 --- a/scripts/update.py +++ b/scripts/update.py @@ -106,7 +106,10 @@ def refresh_feed(item): print(f"Updating feed {item['name']}...") feed = feedparser.parse(item['rss']) for entry in feed.entries[:DEFAULT_ENTRIES_LIMIT]: - entry_title = entry.get("title") or entry.get("description") or entry.get("summary") + entry_title = parse_title(entry) + if not entry_title: + continue + print(f"- article: '{entry_title}' {entry.link}") article, is_created = Article.objects.get_or_create( board_id=item["board_id"], @@ -132,7 +135,7 @@ def refresh_feed(item): article.url = real_url[:2000] article.domain = parse_domain(real_url)[:256] - text, lead_image = parse_rss_entry(entry) + text, lead_image = parse_text_and_image(entry) if text: article.description = text[:1000] @@ -199,7 +202,12 @@ def parse_datetime(entry): return datetime.utcnow() -def parse_rss_entry(entry): +def parse_title(entry): + title = entry.get("title") or entry.get("description") or entry.get("summary") + return re.sub("<[^<]+?>", "", title or "").strip() + + +def parse_text_and_image(entry): bs = BeautifulSoup(entry.summary, features="lxml") text = re.sub(r"\s\s+", " ", bs.text or "").strip()