diff --git a/.gitignore b/.gitignore
index 6b4031b..c8dcd86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,6 @@ local_settings.py
 media/images
 media/i
 boards_private.yml
+
+*.session
+venv/
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 52a1ac3..48d7d25 100644
--- a/Makefile
+++ b/Makefile
@@ -39,6 +39,9 @@ mypy: ## Check types with mypy
 run: ## Runs dev server
 	@python3 manage.py runserver
 
+telegram: ## Create the Telegram session file
+	@python3 setup_telegram.py
+
 test-ci: test-requirements lint mypy ## Run tests (intended for CI usage)
 
 test-requirements: ## Install requirements to run tests
diff --git a/infomate/settings.py b/infomate/settings.py
index 08bb0a2..7af16a2 100644
--- a/infomate/settings.py
+++ b/infomate/settings.py
@@ -17,6 +17,7 @@ INSTALLED_APPS = [
     "django.contrib.humanize",
     "auth",
     "boards",
+    "parsing",
 ]
 
 MIDDLEWARE = [
@@ -116,6 +117,10 @@ SENTRY_DSN = None
 MEDIA_UPLOAD_URL = "https://i.vas3k.ru/upload/"
 MEDIA_UPLOAD_CODE = None  # should be set in private_settings.py
 
+TELEGRAM_APP_ID = None  # should be set in private_settings.py
+TELEGRAM_APP_HASH = None  # should be set in private_settings.py
+TELEGRAM_SESSION_FILE = None  # should be set in private_settings.py
+
 try:
     # poor mans' private settings
     # As due to obvious reasons this file is missing in the repository, suppress the following 'pyflakes' error codes:
diff --git a/infomate/urls.py b/infomate/urls.py
index f0a89e5..3b416c5 100644
--- a/infomate/urls.py
+++ b/infomate/urls.py
@@ -2,6 +2,7 @@ from django.urls import path
 
 from auth.views import login, logout, club_callback
 from boards.views import index, board, privacy_policy, what, export
+from parsing.views import TelegramChannelFeed
 
 urlpatterns = [
     path("", index, name="index"),
@@ -15,4 +16,5 @@ urlpatterns = [
 
     path("/", board, name="board"),
     path("/export/", export, name="export"),
+    path("parsing/telegram/<str:channel>/", TelegramChannelFeed(), name="telegram_channel_feed"),
 ]
diff --git a/parsing/__init__.py b/parsing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parsing/apps.py b/parsing/apps.py
new file mode 100644
index 0000000..51ccaa3
--- /dev/null
+++ b/parsing/apps.py
@@ -0,0 +1,7 @@
+from django.apps import AppConfig
+
+
+class ParsingConfig(AppConfig):
+    """Django app configuration for the ``parsing`` app."""
+
+    name = "parsing"
diff --git a/parsing/telegram/__init__.py b/parsing/telegram/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parsing/telegram/models.py b/parsing/telegram/models.py
new file mode 100644
index 0000000..7e3c7ce
--- /dev/null
+++ b/parsing/telegram/models.py
@@ -0,0 +1,91 @@
+"""Plain in-memory models for a Telegram channel and its messages."""
+from enum import Enum
+
+
+class TelegramChannel:
+    """A Telegram channel together with the messages collected for it."""
+
+    def __init__(
+        self, channel_id=None, title=None, link=None, description=None, messages=None
+    ):
+        self.channel_id = channel_id
+        self.title = title
+        self.link = link
+        self.description = description
+        # None (not []) is the default so every instance gets its own list.
+        self.messages = messages if messages is not None else []
+
+    def add_message(self, message):
+        """Append ``message`` to this channel's messages."""
+        self.messages.append(message)
+
+    def remove_message(self, message):
+        """Remove ``message``; ``list.remove`` raises ValueError if absent."""
+        self.messages.remove(message)
+
+    def to_dict(self):
+        """Return the channel as a dict, with each message as a dict too."""
+        return {
+            "channel_id": self.channel_id,
+            "title": self.title,
+            "link": self.link,
+            "description": self.description,
+            "messages": [message.to_dict() for message in self.messages],
+        }
+
+
+class TelegramChannelMessage:
+    """One channel message reduced to the fields used to build the feed."""
+
+    def __init__(
+        self,
+        telegram_id=None,
+        title=None,
+        description=None,
+        link=None,
+        channel=None,
+        grouped_id=None,
+        type=None,
+        timestamp=None,
+    ):
+        self.telegram_id = telegram_id
+        self.title = title
+        self.description = description
+        self.link = link
+        self.channel = channel
+        self.grouped_id = grouped_id
+        self.type = type
+        self.timestamp = timestamp
+
+    def to_dict(self):
+        """Return the message as a dict of plain values.
+
+        ``channel``, ``type`` and ``timestamp`` default to ``None`` in the
+        constructor, so each is emitted as ``None`` when unset rather than
+        raising ``AttributeError``.
+        """
+        channel, kind, timestamp = self.channel, self.type, self.timestamp
+        return {
+            "telegram_id": self.telegram_id,
+            "title": self.title,
+            "description": self.description,
+            "link": self.link,
+            "channel": None if channel is None else channel.channel_id,
+            "grouped_id": self.grouped_id,
+            "type": None if kind is None else kind.value,
+            "timestamp": (
+                None
+                if timestamp is None
+                else timestamp.strftime("%Y-%m-%d %H:%M:%S")
+            ),
+        }
+
+
+class MessageType(Enum):
+    """Kinds of message content distinguished by the parsers."""
+
+    TEXT = "text"
+    VIDEO = "video"
+    PHOTO = "photo"
+    VOICE = "voice"
+    FILE = "file"
diff --git a/parsing/telegram/parsers.py b/parsing/telegram/parsers.py
new file mode 100644
index 0000000..80793bc
--- /dev/null
+++ b/parsing/telegram/parsers.py
@@ -0,0 +1,163 @@
+"""Parsers that turn Telethon messages into ``TelegramChannelMessage`` objects."""
+from abc import ABC, abstractmethod
+from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType
+import re
+
+_TELEGRAM_CHANNEL_LINK = "https://t.me/{}"
+_TELEGRAM_MESSAGE_LINK = _TELEGRAM_CHANNEL_LINK + "/{}"
+
+
+class Parser(ABC):
+    """Base parser: fills the fields common to every message type."""
+
+    def parse(self, channel, message):
+        """Return a ``TelegramChannelMessage`` with id, link, group and date set."""
+        return TelegramChannelMessage(
+            channel=channel,
+            link=_TELEGRAM_MESSAGE_LINK.format(channel.channel_id, message.id),
+            telegram_id=message.id,
+            grouped_id=message.grouped_id,
+            timestamp=message.date,
+        )
+
+    @abstractmethod
+    def matches(self, channel, message):
+        """Return True when this parser can handle ``message``."""
+
+    @staticmethod
+    def from_message(channel, message):
+        """Return the first registered parser matching ``message``, or None."""
+        for parser in _messages_parsers:
+            if parser.matches(channel, message):
+                return parser
+        return None
+
+
+class SimpleTextParser(Parser):
+    """Parser for messages that carry text."""
+
+    # Raw strings: "\*" in a plain literal is an invalid escape sequence.
+    MARKDOWN_BOLD = r".+\*.+\*.+"
+    SIMPLE_TEXT_POST = r"(.+)\n+(.+)"
+
+    def parse(self, channel, message):
+        """Parse a text message into a titled ``MessageType.TEXT`` item."""
+        parsed_message = super().parse(channel, message)
+        title, description = self.parse_text(message)
+        parsed_message.title = self.enhance_title(title)
+        parsed_message.description = description
+        parsed_message.type = MessageType.TEXT
+        return parsed_message
+
+    def parse_text(self, message):
+        """Split ``message.text`` into ``(title, description)``.
+
+        When the text matches ``SIMPLE_TEXT_POST`` the first group is the
+        title and the second group the description; otherwise the whole text
+        is used for both.
+        """
+        text = message.text
+        matcher = re.match(self.SIMPLE_TEXT_POST, text)
+        if matcher is None:
+            return text, text
+        # Only reachable with a one-group pattern, e.g. an overridden
+        # SIMPLE_TEXT_POST; the default pattern always has two groups.
+        if len(matcher.groups()) < 2:
+            return matcher.group(1), text
+        return matcher.group(1), matcher.group(2)
+
+    def enhance_title(self, title):
+        """Strip ``*`` characters from titles that match ``MARKDOWN_BOLD``."""
+        if re.match(self.MARKDOWN_BOLD, title) is None:
+            return title
+        return title.replace("*", "")
+
+    def matches(self, channel, message):
+        """Match any message with non-empty text."""
+        return bool(message.text)
+
+
+class VideoParser(Parser):
+    """Parser for messages whose ``video`` attribute is set."""
+
+    def parse(self, channel, message):
+        """Parse a video message into a ``[video]`` item."""
+        parsed_message = super().parse(channel, message)
+        parsed_message.title = "[video]"
+        parsed_message.type = MessageType.VIDEO
+        return parsed_message
+
+    def matches(self, channel, message):
+        """Match messages that carry a video."""
+        return message.video is not None
+
+
+class PhotoParser(Parser):
+    """Parser for messages whose ``photo`` attribute is set."""
+
+    def parse(self, channel, message):
+        """Parse a photo message into a ``[photo]`` item."""
+        parsed_message = super().parse(channel, message)
+        parsed_message.title = "[photo]"
+        parsed_message.type = MessageType.PHOTO
+        return parsed_message
+
+    def matches(self, channel, message):
+        """Match messages that carry a photo."""
+        return message.photo is not None
+
+
+class FileParser(Parser):
+    """Parser for messages whose ``file`` attribute is set."""
+
+    def parse(self, channel, message):
+        """Parse a file message into a ``[file]`` item."""
+        parsed_message = super().parse(channel, message)
+        parsed_message.title = "[file]"
+        parsed_message.type = MessageType.FILE
+        return parsed_message
+
+    def matches(self, channel, message):
+        """Match messages that carry a file."""
+        return message.file is not None
+
+
+class VoiceParser(Parser):
+    """Parser for messages whose ``voice`` attribute is set."""
+
+    def parse(self, channel, message):
+        """Parse a voice message into a ``[voice]`` item."""
+        parsed_message = super().parse(channel, message)
+        parsed_message.title = "[voice]"
+        parsed_message.type = MessageType.VOICE
+        return parsed_message
+
+    def matches(self, channel, message):
+        """Match messages that carry a voice recording."""
+        return message.voice is not None
+
+
+def parse_channel(channel_id, chat_full):
+    """Build a ``TelegramChannel`` (without messages) from a full-channel result.
+
+    Reads the title and username from ``chat_full.chats[0]`` and the
+    description from ``chat_full.full_chat.about``.
+    """
+    chat = chat_full.chats[0]
+    return TelegramChannel(
+        channel_id=channel_id,
+        title=chat.title,
+        description=chat_full.full_chat.about,
+        link=_TELEGRAM_CHANNEL_LINK.format(chat.username),
+    )
+
+
+# Order matters: ``Parser.from_message`` returns the first match, so a media
+# message that also has text is handled as text.
+_messages_parsers = [
+    SimpleTextParser(),
+    VideoParser(),
+    PhotoParser(),
+    VoiceParser(),
+    FileParser(),
+]
diff --git a/parsing/telegram/telegram.py b/parsing/telegram/telegram.py
new file mode 100644
index 0000000..138b6a1
--- /dev/null
+++ b/parsing/telegram/telegram.py
@@ -0,0 +1,91 @@
+"""Fetch a Telegram channel and its recent messages through Telethon."""
+from telethon.sync import TelegramClient, functions
+from django.conf import settings
+from parsing.telegram.parsers import Parser, parse_channel
+from parsing.telegram.models import MessageType
+import asyncio
+
+
+def get_channel(channel_id, messages_limit):
+    """Return channel ``channel_id`` parsed together with its recent messages.
+
+    ``messages_limit`` is forwarded as the ``limit`` of ``iter_messages``.
+    The event loop created for the call is closed afterwards so that loops
+    do not accumulate across calls.
+    """
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        with TelegramClient(
+            settings.TELEGRAM_SESSION_FILE,
+            settings.TELEGRAM_APP_ID,
+            settings.TELEGRAM_APP_HASH,
+            loop=loop,
+        ) as client:
+            channel = parse_channel(
+                channel_id,
+                client(functions.channels.GetFullChannelRequest(channel=channel_id)),
+            )
+            channel.messages = __get_channel_messages(client, channel, messages_limit)
+            return channel
+    finally:
+        loop.close()
+
+
+def _find_group_indexes(messages, grouped_id, message_type, inverse=False):
+    """Return indexes of ``messages`` in group ``grouped_id`` filtered by type.
+
+    With ``inverse=False`` a message is selected when its type equals
+    ``message_type``; with ``inverse=True`` when its type differs.
+    """
+    return [
+        index
+        for index, message in enumerate(messages)
+        if message.grouped_id == grouped_id
+        and (message.type == message_type) != inverse
+    ]
+
+
+def _merge_grouped_message(messages, new_message):
+    """Add ``new_message`` to ``messages``, collapsing its group around text.
+
+    * A text message replaces the non-text messages already collected for its
+      group, or is appended when there are none.
+    * A non-text message is appended only while the group has no text
+      message.
+    """
+    group = new_message.grouped_id
+    if new_message.type != MessageType.TEXT:
+        if not _find_group_indexes(messages, group, MessageType.TEXT):
+            messages.append(new_message)
+        return
+
+    indexes = _find_group_indexes(messages, group, MessageType.TEXT, inverse=True)
+    if not indexes:
+        messages.append(new_message)
+        return
+    messages[indexes.pop()] = new_message
+    # Delete by position, highest index first, so earlier deletions do not
+    # shift the positions still to be deleted.
+    for index in reversed(indexes):
+        del messages[index]
+
+
+def __get_channel_messages(client, channel, messages_limit):
+    """Iterate the channel's messages and return the parsed, merged list.
+
+    Messages for which no parser matches are skipped.
+    """
+    channel_messages = []
+    for t_message in client.iter_messages(channel.channel_id, limit=messages_limit):
+        parser = Parser.from_message(channel, t_message)
+        if parser is None:
+            continue
+        message = parser.parse(channel, t_message)
+        if message is None:
+            continue
+        if message.grouped_id is not None:
+            _merge_grouped_message(channel_messages, message)
+        else:
+            channel_messages.append(message)
+    return channel_messages
diff --git a/parsing/views.py b/parsing/views.py
new file mode 100644
index 0000000..5415ca3
--- /dev/null
+++ b/parsing/views.py
@@ -0,0 +1,53 @@
+"""Syndication feeds built from parsed Telegram channels."""
+from django.contrib.syndication.views import Feed
+from parsing.telegram.telegram import get_channel
+
+
+class TelegramChannelFeed(Feed):
+    """Feed of a Telegram channel's messages.
+
+    The number of messages read can be set with the ``size`` query parameter.
+    """
+
+    FEED_ITEMS = 30
+    # Upper bound for the client-supplied ``size`` so a single request cannot
+    # ask for an arbitrarily large number of messages.
+    MAX_FEED_ITEMS = 100
+
+    def get_object(self, request, channel):
+        """Fetch ``channel`` with the validated number of messages."""
+        return get_channel(channel, self._feed_size(request))
+
+    def _feed_size(self, request):
+        """Return ``size`` from the query string clamped to 1..MAX_FEED_ITEMS.
+
+        A missing or non-integer value falls back to ``FEED_ITEMS``.
+        """
+        try:
+            size = int(request.GET["size"])
+        except (KeyError, ValueError):
+            return self.FEED_ITEMS
+        return max(1, min(size, self.MAX_FEED_ITEMS))
+
+    # Feed-level hooks: ``obj`` is the channel returned by ``get_object``.
+    def title(self, obj):
+        return obj.title
+
+    def link(self, obj):
+        return obj.link
+
+    def description(self, obj):
+        return obj.description
+
+    def items(self, obj):
+        return obj.messages
+
+    # Item-level hooks: ``item`` is one entry of ``obj.messages``.
+    def item_title(self, item):
+        return item.title
+
+    def item_description(self, item):
+        return item.description
+
+    def item_link(self, item):
+        return item.link
diff --git a/requirements.txt b/requirements.txt
index 609c6d8..2d76e85 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ sentry-sdk==0.13.5
 pyjwt==1.7.1
 nltk==3.4.5
 newspaper3k>=0.2.8
+telethon==1.10.10
diff --git a/setup_telegram.py b/setup_telegram.py
new file mode 100644
index 0000000..af0e4d4
--- /dev/null
+++ b/setup_telegram.py
@@ -0,0 +1,30 @@
+"""Create the Telegram session file used by the ``parsing`` app.
+
+Run with ``make telegram``. Presumably the first run prompts for a Telegram
+login interactively -- confirm against the Telethon documentation.
+"""
+import os
+
+# Django settings must be selected before ``django.setup()`` is called.
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")
+
+import django  # noqa: E402
+
+django.setup()
+
+from telethon.sync import TelegramClient  # noqa: E402
+from django.conf import settings  # noqa: E402
+
+
+def main():
+    """Open a Telegram client once using the configured session file."""
+    with TelegramClient(
+        settings.TELEGRAM_SESSION_FILE,
+        settings.TELEGRAM_APP_ID,
+        settings.TELEGRAM_APP_HASH,
+    ):
+        print("Successfully setup Telegram session.")
+
+
+if __name__ == "__main__":
+    main()