From 95da927828f30a68fa4dbe9bd6bd53b3b5fe858e Mon Sep 17 00:00:00 2001 From: Vasily Zubarev Date: Tue, 18 Feb 2020 15:32:35 +0100 Subject: [PATCH] Add telegram message cleanup --- parsing/telegram/cleanup.py | 16 ++++++++++++++ parsing/telegram/models.py | 42 ++++++++++--------------------------- parsing/telegram/parsers.py | 34 +++++++++--------------------- 3 files changed, 37 insertions(+), 55 deletions(-) create mode 100644 parsing/telegram/cleanup.py diff --git a/parsing/telegram/cleanup.py b/parsing/telegram/cleanup.py new file mode 100644 index 0000000..b3b68cd --- /dev/null +++ b/parsing/telegram/cleanup.py @@ -0,0 +1,16 @@ +import re + + +def cleanup_telegram_message_text(text): + text = str(text) + cleanup_expressions = [ + r"\[.*?\]\(.*?\)", # attached files and images + r"#[\S]*", # hashtags + r"[\*\~\`]+", # stars and tildas + r"<[^>]*>", # html tags + r"[\n]{3,}", # triple newlines + ] + for pattern in cleanup_expressions: + text = re.sub(pattern, "", text) + + return text.strip() diff --git a/parsing/telegram/models.py b/parsing/telegram/models.py index 7e3c7ce..1bffe97 100644 --- a/parsing/telegram/models.py +++ b/parsing/telegram/models.py @@ -3,28 +3,18 @@ from enum import Enum class TelegramChannel: def __init__( - self, channel_id=None, title=None, link=None, description=None, messages=None + self, + channel_id=None, + title=None, + link=None, + description=None, + messages=None ): self.channel_id = channel_id self.title = title self.link = link self.description = description - self.messages = messages if messages is not None else [] - - def add_message(self, message): - self.messages.append(message) - - def remove_message(self, message): - self.messages.remove(message) - - def to_dict(self): - return { - "channel_id": self.channel_id, - "title": self.title, - "link": self.link, - "description": self.description, - "messages": list(map(lambda message: message.to_dict(), self.messages)), - } + self.messages = messages or [] class TelegramChannelMessage: @@ -32,7 +22,8 @@ class TelegramChannelMessage: self, telegram_id=None, title=None, - description=None, + text=None, + clean_text=None, link=None, channel=None, grouped_id=None, @@ -41,25 +32,14 @@ class TelegramChannelMessage: ): self.telegram_id = telegram_id self.title = title - self.description = description + self.text = text + self.clean_text = clean_text self.link = link self.channel = channel self.grouped_id = grouped_id self.type = type self.timestamp = timestamp - def to_dict(self): - return { - "telegram_id": self.telegram_id, - "title": self.title, - "description": self.description, - "link": self.link, - "channel": self.channel.channel_id, - "grouped_id": self.grouped_id, - "type": self.type.value, - "timestamp": self.timestamp.strftime("%Y-%m-%d %H:%M:%S"), - } - class MessageType(Enum): TEXT = "text" diff --git a/parsing/telegram/parsers.py b/parsing/telegram/parsers.py index 80793bc..a120b3d 100644 --- a/parsing/telegram/parsers.py +++ b/parsing/telegram/parsers.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod + +from parsing.telegram.cleanup import cleanup_telegram_message_text from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType -import re class Parser(ABC): @@ -26,34 +27,19 @@ class Parser(ABC): class SimpleTextParser(Parser): - MARKDOWN_BOLD = ".+\*.+\*.+" - SIMPLE_TEXT_POST = "(.+)\n+(.+)" - def parse(self, channel, message): parsed_message = super().parse(channel, message) - (title, description) = self.parse_text(message) - parsed_message.title = self.enhance_title(title) - parsed_message.description = description + parsed_message.text = message + parsed_message.clean_text = cleanup_telegram_message_text(message) + parsed_message.title = self.parse_message_title(parsed_message.clean_text) parsed_message.type = MessageType.TEXT return parsed_message - def parse_text(self, message): - matcher = re.match(self.SIMPLE_TEXT_POST, message.text) - if matcher is not None: - if len(matcher.groups()) > 1: - title = matcher.group(1) - description = matcher.group(2) - else: - title = matcher.group(1) - description = message.text - else: - title = message.text - description = message.text - return title, description - - def enhance_title(self, title): - matcher = re.match(self.MARKDOWN_BOLD, title) - return title.replace("*", "") if matcher is not None else title + @classmethod + def parse_message_title(cls, text): + if text: + return text.split("\n", 1)[0] + return "" def matches(self, channel, message): return message.text is not None and len(message.text) > 0