Add telegram message cleanup

This commit is contained in:
Vasily Zubarev
2020-02-18 15:32:35 +01:00
parent b6150ee968
commit 95da927828
3 changed files with 37 additions and 55 deletions

View File

@@ -0,0 +1,16 @@
import re
# Ordered (compiled pattern, replacement) pairs applied by
# cleanup_telegram_message_text. The newline rule must stay last: removing
# hashtags or tags that sit on their own line can create new runs of newlines.
_TELEGRAM_CLEANUP_RULES = (
    (re.compile(r"\[.*?\]\(.*?\)"), ""),  # markdown links: attached files and images
    (re.compile(r"#[\S]*"), ""),  # hashtags
    (re.compile(r"[\*\~\`]+"), ""),  # markdown stars, tildes and backticks
    (re.compile(r"<[^>]*>"), ""),  # html tags
    (re.compile(r"[\n]{3,}"), "\n\n"),  # 3+ newlines collapse to one blank line
)


def cleanup_telegram_message_text(text):
    """Strip Telegram/markdown markup from a message and return plain text.

    Removes markdown links (``[label](target)``), hashtags, runs of ``*``,
    ``~`` and backtick characters and HTML-like tags, then collapses runs of
    three or more newlines into a single blank line and trims surrounding
    whitespace.

    Args:
        text: The message text. Non-string values are converted with
            ``str()``; ``None`` is treated as an empty message.

    Returns:
        The cleaned text, or ``""`` when ``text`` is ``None``.
    """
    if text is None:
        # str(None) would otherwise leak the literal word "None" into the output.
        return ""
    text = str(text)
    for pattern, replacement in _TELEGRAM_CLEANUP_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()

View File

@@ -3,28 +3,18 @@ from enum import Enum
# Container for one Telegram channel: stores channel_id, title, link,
# description and a collection of messages.
# NOTE(review): this span shows two parameter lists for __init__ and two
# assignments to self.messages; it looks like the removed and added lines of a
# diff rendered together — confirm which version is current before editing.
class TelegramChannel:
def __init__(
self, channel_id=None, title=None, link=None, description=None, messages=None
self,
channel_id=None,
title=None,
link=None,
description=None,
messages=None
):
self.channel_id = channel_id
self.title = title
self.link = link
self.description = description
# Default to a fresh empty list only when messages is None.
self.messages = messages if messages is not None else []
# Append a message to this channel's collection.
def add_message(self, message):
self.messages.append(message)
# Remove a message by delegating to self.messages.remove.
def remove_message(self, message):
self.messages.remove(message)
# Serialize the channel to a dict; each message is serialized through its
# own to_dict().
def to_dict(self):
return {
"channel_id": self.channel_id,
"title": self.title,
"link": self.link,
"description": self.description,
"messages": list(map(lambda message: message.to_dict(), self.messages)),
}
# Default to a fresh empty list for any falsy messages value (None or empty).
self.messages = messages or []
class TelegramChannelMessage:
@@ -32,7 +22,8 @@ class TelegramChannelMessage:
self,
telegram_id=None,
title=None,
description=None,
text=None,
clean_text=None,
link=None,
channel=None,
grouped_id=None,
@@ -41,25 +32,14 @@ class TelegramChannelMessage:
):
self.telegram_id = telegram_id
self.title = title
self.description = description
self.text = text
self.clean_text = clean_text
self.link = link
self.channel = channel
self.grouped_id = grouped_id
self.type = type
self.timestamp = timestamp
def to_dict(self):
    """Serialize this message into a plain dictionary.

    The channel is represented by its ``channel_id``, the type by its
    ``value`` and the timestamp as a ``"%Y-%m-%d %H:%M:%S"`` string.

    Returns:
        A dict with the keys ``telegram_id``, ``title``, ``description``,
        ``link``, ``channel``, ``grouped_id``, ``type`` and ``timestamp``.
    """
    # Filled key by key so attributes are read in the same order as the keys.
    payload = {}
    payload["telegram_id"] = self.telegram_id
    payload["title"] = self.title
    payload["description"] = self.description
    payload["link"] = self.link
    payload["channel"] = self.channel.channel_id
    payload["grouped_id"] = self.grouped_id
    payload["type"] = self.type.value
    payload["timestamp"] = self.timestamp.strftime("%Y-%m-%d %H:%M:%S")
    return payload
class MessageType(Enum):
TEXT = "text"

View File

@@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
from parsing.telegram.cleanup import cleanup_telegram_message_text
from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType
import re
class Parser(ABC):
@@ -26,34 +27,19 @@ class Parser(ABC):
class SimpleTextParser(Parser):
MARKDOWN_BOLD = ".+\*.+\*.+"
SIMPLE_TEXT_POST = "(.+)\n+(.+)"
# Build a parsed message through the parent class's parse(), fill in its text
# fields and title, mark it as MessageType.TEXT and return it.
# NOTE(review): title is assigned twice below (first via enhance_title, then
# via parse_message_title), so only the second value survives; this looks like
# removed and added diff lines shown together — confirm which set is current.
def parse(self, channel, message):
parsed_message = super().parse(channel, message)
(title, description) = self.parse_text(message)
parsed_message.title = self.enhance_title(title)
parsed_message.description = description
# NOTE(review): the message object itself (not message.text) is stored and
# handed to the cleanup helper, whereas parse_text() and matches() read
# message.text — confirm this is intended.
parsed_message.text = message
parsed_message.clean_text = cleanup_telegram_message_text(message)
parsed_message.title = self.parse_message_title(parsed_message.clean_text)
parsed_message.type = MessageType.TEXT
return parsed_message
def parse_text(self, message):
    """Split ``message.text`` into a ``(title, description)`` pair.

    The text is matched against ``self.SIMPLE_TEXT_POST``. Without a match
    both values are the full text. With a match the title is group 1, and
    the description is group 2 when the pattern defines more than one
    group, otherwise the full text.
    """
    found = re.match(self.SIMPLE_TEXT_POST, message.text)
    if found is None:
        return message.text, message.text
    if len(found.groups()) > 1:
        return found.group(1), found.group(2)
    return found.group(1), message.text
def enhance_title(self, title):
    """Drop every ``*`` from ``title`` when it matches ``self.MARKDOWN_BOLD``.

    Titles that do not match the pattern are returned unchanged.
    """
    if re.match(self.MARKDOWN_BOLD, title) is None:
        return title
    return title.replace("*", "")
@classmethod
def parse_message_title(cls, text):
if text:
return text.split("\n", 1)[0]
return ""
def matches(self, channel, message):
    """Return True when ``message.text`` is present and non-empty."""
    if message.text is None:
        return False
    return len(message.text) > 0