Add telegram message cleanup
This commit is contained in:
16
parsing/telegram/cleanup.py
Normal file
16
parsing/telegram/cleanup.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import re
|
||||
|
||||
|
||||
def cleanup_telegram_message_text(text):
    """Strip Telegram/Markdown markup from a message text.

    The following are removed, in this order:

    * Markdown links/attachments of the form ``[label](target)``
    * hashtags (``#`` followed by any run of non-whitespace characters)
    * Markdown emphasis/code markers (``*``, ``~`` and backticks)
    * HTML-like tags (``<...>``)

    Finally, runs of three or more newlines are collapsed to a single blank
    line and leading/trailing whitespace is stripped.

    Args:
        text: The raw message text. Non-string values are converted with
            ``str()``; ``None`` is treated as an empty message.

    Returns:
        The cleaned text as a ``str``.
    """
    if text is None:
        # str(None) would yield the literal "None" as the cleaned text.
        return ""

    text = str(text)

    # (pattern, replacement) pairs, applied sequentially. The newline rule
    # must stay last: removing markup above can leave extra empty lines.
    cleanup_rules = (
        (r"\[.*?\]\(.*?\)", ""),  # attached files and images
        (r"#[\S]*", ""),  # hashtags
        (r"[\*\~\`]+", ""),  # stars, tildes and backticks
        (r"<[^>]*>", ""),  # html tags
        # Collapse instead of delete: substituting "" here would glue the
        # surrounding paragraphs together ("Title\n\n\nBody" -> "TitleBody").
        (r"[\n]{3,}", "\n\n"),
    )
    for pattern, replacement in cleanup_rules:
        text = re.sub(pattern, replacement, text)

    return text.strip()
|
||||
@@ -3,28 +3,18 @@ from enum import Enum
|
||||
|
||||
class TelegramChannel:
|
||||
def __init__(
|
||||
self, channel_id=None, title=None, link=None, description=None, messages=None
|
||||
self,
|
||||
channel_id=None,
|
||||
title=None,
|
||||
link=None,
|
||||
description=None,
|
||||
messages=None
|
||||
):
|
||||
self.channel_id = channel_id
|
||||
self.title = title
|
||||
self.link = link
|
||||
self.description = description
|
||||
self.messages = messages if messages is not None else []
|
||||
|
||||
def add_message(self, message):
|
||||
self.messages.append(message)
|
||||
|
||||
def remove_message(self, message):
|
||||
self.messages.remove(message)
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
"channel_id": self.channel_id,
|
||||
"title": self.title,
|
||||
"link": self.link,
|
||||
"description": self.description,
|
||||
"messages": list(map(lambda message: message.to_dict(), self.messages)),
|
||||
}
|
||||
self.messages = messages or []
|
||||
|
||||
|
||||
class TelegramChannelMessage:
|
||||
@@ -32,7 +22,8 @@ class TelegramChannelMessage:
|
||||
self,
|
||||
telegram_id=None,
|
||||
title=None,
|
||||
description=None,
|
||||
text=None,
|
||||
clean_text=None,
|
||||
link=None,
|
||||
channel=None,
|
||||
grouped_id=None,
|
||||
@@ -41,25 +32,14 @@ class TelegramChannelMessage:
|
||||
):
|
||||
self.telegram_id = telegram_id
|
||||
self.title = title
|
||||
self.description = description
|
||||
self.text = text
|
||||
self.clean_text = clean_text
|
||||
self.link = link
|
||||
self.channel = channel
|
||||
self.grouped_id = grouped_id
|
||||
self.type = type
|
||||
self.timestamp = timestamp
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
"telegram_id": self.telegram_id,
|
||||
"title": self.title,
|
||||
"description": self.description,
|
||||
"link": self.link,
|
||||
"channel": self.channel.channel_id,
|
||||
"grouped_id": self.grouped_id,
|
||||
"type": self.type.value,
|
||||
"timestamp": self.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
|
||||
|
||||
class MessageType(Enum):
|
||||
TEXT = "text"
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from parsing.telegram.cleanup import cleanup_telegram_message_text
|
||||
from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType
|
||||
import re
|
||||
|
||||
|
||||
class Parser(ABC):
|
||||
@@ -26,34 +27,19 @@ class Parser(ABC):
|
||||
|
||||
|
||||
class SimpleTextParser(Parser):
|
||||
MARKDOWN_BOLD = ".+\*.+\*.+"
|
||||
SIMPLE_TEXT_POST = "(.+)\n+(.+)"
|
||||
|
||||
def parse(self, channel, message):
|
||||
parsed_message = super().parse(channel, message)
|
||||
(title, description) = self.parse_text(message)
|
||||
parsed_message.title = self.enhance_title(title)
|
||||
parsed_message.description = description
|
||||
parsed_message.text = message
|
||||
parsed_message.clean_text = cleanup_telegram_message_text(message)
|
||||
parsed_message.title = self.parse_message_title(parsed_message.clean_text)
|
||||
parsed_message.type = MessageType.TEXT
|
||||
return parsed_message
|
||||
|
||||
def parse_text(self, message):
|
||||
matcher = re.match(self.SIMPLE_TEXT_POST, message.text)
|
||||
if matcher is not None:
|
||||
if len(matcher.groups()) > 1:
|
||||
title = matcher.group(1)
|
||||
description = matcher.group(2)
|
||||
else:
|
||||
title = matcher.group(1)
|
||||
description = message.text
|
||||
else:
|
||||
title = message.text
|
||||
description = message.text
|
||||
return title, description
|
||||
|
||||
def enhance_title(self, title):
|
||||
matcher = re.match(self.MARKDOWN_BOLD, title)
|
||||
return title.replace("*", "") if matcher is not None else title
|
||||
@classmethod
|
||||
def parse_message_title(cls, text):
|
||||
if text:
|
||||
return text.split("\n", 1)[0]
|
||||
return ""
|
||||
|
||||
def matches(self, channel, message):
|
||||
return message.text is not None and len(message.text) > 0
|
||||
|
||||
Reference in New Issue
Block a user