diff --git a/infomate/urls.py b/infomate/urls.py index 6c4c0db..59adcb7 100644 --- a/infomate/urls.py +++ b/infomate/urls.py @@ -13,7 +13,8 @@ urlpatterns = [ path("/", board, name="board"), - path("parsing/telegram//", + path("parsing/telegram//", cache_page(settings.TELEGRAM_CACHE_SECONDS)(TelegramChannelFeed()), - name="telegram_channel_feed") + name="telegram_channel_feed"), + ] diff --git a/parsing/telegram/cleanup.py b/parsing/telegram/cleanup.py deleted file mode 100644 index 570d18b..0000000 --- a/parsing/telegram/cleanup.py +++ /dev/null @@ -1,16 +0,0 @@ -import re - - -def cleanup_telegram_message_text(text): - text = str(text) - cleanup_expressions = [ - r"\[.*?\]\(.*?\)", # attached files and images - r"#[\S]*", # hashtags - r"[\*\~\`]+", # stars and tildas - r"<[^>]*>", # html tags - r"[\n]{3,}", # triple newlines - ] - for pattern in cleanup_expressions: - text = re.sub(pattern, "", text, flags=re.M) - - return text.strip() diff --git a/parsing/telegram/models.py b/parsing/telegram/models.py deleted file mode 100644 index 1bffe97..0000000 --- a/parsing/telegram/models.py +++ /dev/null @@ -1,49 +0,0 @@ -from enum import Enum - - -class TelegramChannel: - def __init__( - self, - channel_id=None, - title=None, - link=None, - description=None, - messages=None - ): - self.channel_id = channel_id - self.title = title - self.link = link - self.description = description - self.messages = messages or [] - - -class TelegramChannelMessage: - def __init__( - self, - telegram_id=None, - title=None, - text=None, - clean_text=None, - link=None, - channel=None, - grouped_id=None, - type=None, - timestamp=None, - ): - self.telegram_id = telegram_id - self.title = title - self.text = text - self.clean_text = clean_text - self.link = link - self.channel = channel - self.grouped_id = grouped_id - self.type = type - self.timestamp = timestamp - - -class MessageType(Enum): - TEXT = "text" - VIDEO = "video" - PHOTO = "photo" - VOICE = "voice" - FILE = "file" diff --git a/parsing/telegram/parser.py b/parsing/telegram/parser.py new file mode 100644 index 0000000..912f0fd --- /dev/null +++ b/parsing/telegram/parser.py @@ -0,0 +1,70 @@ +import re +from collections import namedtuple +from datetime import datetime + +import requests +from bs4 import BeautifulSoup + +from scripts.common import DEFAULT_REQUEST_TIMEOUT, DEFAULT_REQUEST_HEADERS + +TELEGRAM_CHANNEL_WEBVIEW_PREFIX = "https://t.me/s/" +BACKGROUND_IMAGE_RE = re.compile("url\('(https://.+?)'\)") + +TELEGRAM_MESSAGE_CLASS = ".tgme_widget_message" +TELEGRAM_MESSAGE_TEXT_CLASS = ".tgme_widget_message_text" +TELEGRAM_MESSAGE_PHOTO_CLASS = ".tgme_widget_message_photo_wrap" +TELEGRAM_MESSAGE_DATE_CLASS = ".tgme_widget_message_date" + +TelegramChannel = namedtuple("TelegramChannel", ["url", "name", "messages"]) +TelegramMessage = namedtuple("TelegramMessage", ["url", "text", "photo", "created_at"]) + + +def parse_channel(channel_name, only_text=False, limit=100) -> TelegramChannel: + channel_url = TELEGRAM_CHANNEL_WEBVIEW_PREFIX + channel_name + response = requests.get( + url=channel_url, + timeout=DEFAULT_REQUEST_TIMEOUT, + headers=DEFAULT_REQUEST_HEADERS, + ) + + bs = BeautifulSoup(response.text, features="lxml") + + messages = [] + message_tags = bs.select(TELEGRAM_MESSAGE_CLASS) + for message_tag in message_tags: + message_text = None + message_text_tag = message_tag.select(TELEGRAM_MESSAGE_TEXT_CLASS) + if message_text_tag: + message_text = str(message_text_tag[0]) + + message_photo = None + message_photo_tag = message_tag.select(TELEGRAM_MESSAGE_PHOTO_CLASS) + if message_photo_tag: + message_photo = BACKGROUND_IMAGE_RE.search(str(message_photo_tag[0])).group(1) + + message_url = None + message_time = datetime.utcnow() + message_date_tag = message_tag.select(TELEGRAM_MESSAGE_DATE_CLASS) + if message_date_tag: + message_url = message_date_tag[0]["href"] + message_datetime_tag = message_date_tag[0].select("time") + if message_datetime_tag: + message_time = datetime.fromisoformat(message_datetime_tag[0]["datetime"]) + + messages.append( + TelegramMessage( + url=message_url, + text=message_text, + photo=message_photo, + created_at=message_time, + ) + ) + + if only_text: + messages = [m for m in messages if m.text] + + return TelegramChannel( + url=channel_url, + name=channel_name, + messages=list(reversed(messages))[:limit], + ) diff --git a/parsing/telegram/parsers.py b/parsing/telegram/parsers.py deleted file mode 100644 index 999f899..0000000 --- a/parsing/telegram/parsers.py +++ /dev/null @@ -1,110 +0,0 @@ -from abc import ABC, abstractmethod - -from parsing.telegram.cleanup import cleanup_telegram_message_text -from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType - - -class Parser(ABC): - def parse(self, channel, message): - return TelegramChannelMessage( - channel=channel, - link=_TELEGRAM_MESSAGE_LINK.format(channel.channel_id, message.id), - telegram_id=message.id, - grouped_id=message.grouped_id, - timestamp=message.date, - ) - - @abstractmethod - def matches(self, channel, message): - pass - - @staticmethod - def from_message(channel, message): - for parser in _messages_parsers: - if parser.matches(channel, message): - return parser - return None - - -class SimpleTextParser(Parser): - def parse(self, channel, message): - parsed_message = super().parse(channel, message) - parsed_message.text = message.text - parsed_message.clean_text = cleanup_telegram_message_text(message.text) - parsed_message.title = self.parse_message_title(parsed_message.clean_text) - parsed_message.type = MessageType.TEXT - return parsed_message - - @classmethod - def parse_message_title(cls, text): - if text: - return text.split("\n", 1)[0] - return "" - - def matches(self, channel, message): - return message.text is not None and len(message.text) > 0 - - -class VideoParser(Parser): - def parse(self, channel, message): - parsed_message = super().parse(channel, message) - parsed_message.title = "[video]" - parsed_message.type = MessageType.VIDEO - return parsed_message - - def matches(self, channel, message): - return message.video is not None - - -class PhotoParser(Parser): - def parse(self, channel, message): - parsed_message = super().parse(channel, message) - parsed_message.title = "[photo]" - parsed_message.type = MessageType.PHOTO - return parsed_message - - def matches(self, channel, message): - return message.photo is not None - - -class FileParser(Parser): - def parse(self, channel, message): - parsed_message = super().parse(channel, message) - parsed_message.title = "[file]" - parsed_message.type = MessageType.FILE - return parsed_message - - def matches(self, channel, message): - return message.file is not None - - -class VoiceParser(Parser): - def parse(self, channel, message): - parsed_message = super().parse(channel, message) - parsed_message.title = "[voice]" - parsed_message.type = MessageType.VOICE - return parsed_message - - def matches(self, channel, message): - return message.voice is not None - - -def parse_channel(channel_id, chat_full): - return TelegramChannel( - channel_id=channel_id, - title=chat_full.chats[0].title, - description=chat_full.full_chat.about, - link=_TELEGRAM_CHANNEL_LINK.format(chat_full.chats[0].username), - ) - - -_messages_parsers = [ - SimpleTextParser(), - VideoParser(), - PhotoParser(), - VoiceParser(), - FileParser(), -] - -_TELEGRAM_CHANNEL_LINK = "https://t.me/{}" -_TELEGRAM_MESSAGE_LINK = _TELEGRAM_CHANNEL_LINK + "/{}" diff --git a/parsing/telegram/telegram.py b/parsing/telegram/telegram.py deleted file mode 100644 index cdf82a7..0000000 --- a/parsing/telegram/telegram.py +++ /dev/null @@ -1,78 +0,0 @@ -from telethon.sync import TelegramClient, functions -from django.conf import settings - -from parsing.exceptions import ParsingException -from parsing.telegram.parsers import Parser, parse_channel -from parsing.telegram.models import MessageType -import asyncio - -DEFAULT_LIMIT = 30 - - -def get_channel(channel_id, *, types=None, limit=DEFAULT_LIMIT): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - with TelegramClient( - settings.TELEGRAM_SESSION_FILE, - settings.TELEGRAM_APP_ID, - settings.TELEGRAM_APP_HASH, - loop=loop, - ) as client: - try: - channel = parse_channel( - channel_id, - client(functions.channels.GetFullChannelRequest(channel=channel_id)), - ) - except ValueError: - raise ParsingException(f"No channel named '{channel_id}'") - - channel.messages = get_channel_messages(client, channel, types=types, limit=limit) - - return channel - - -def get_channel_messages(client, channel, *, types=None, limit=DEFAULT_LIMIT): - def get_messages_indexes(messages, grouped_id, type=None, inverse=False): - type_predicate = lambda m: m != type if inverse else m == type - indexes = [] - for i, message in enumerate(messages): - if type_predicate(message) and message.grouped_id == grouped_id: - indexes.append(i) - return indexes - - def merge_messages(messages, new_message): - if new_message.type == MessageType.TEXT: - indexes = get_messages_indexes( - messages, new_message.grouped_id, type=MessageType.TEXT, inverse=True - ) - if len(indexes) > 0: - messages[indexes.pop()] = new_message - - for i in indexes: - try: - messages.remove(i) - except ValueError: - pass # skip missing messages - else: - indexes = get_messages_indexes( - messages, new_message.grouped_id, type=MessageType.TEXT - ) - if len(indexes) == 0: - messages.append(new_message) - - channel_messages = [] - - for t_message in client.iter_messages(channel.channel_id, limit=limit): - parser = Parser.from_message(channel, t_message) - if parser is not None: - message = parser.parse(channel, t_message) - if message is not None: - if types and message.type not in types: - continue - - if message.grouped_id is not None: - merge_messages(channel_messages, message) - else: - channel_messages.append(message) - - return channel_messages diff --git a/parsing/views.py b/parsing/views.py index 4f3caf3..ac9006a 100644 --- a/parsing/views.py +++ b/parsing/views.py @@ -1,47 +1,38 @@ from django.contrib.syndication.views import Feed -from django.http import Http404, HttpResponseBadRequest -from parsing.exceptions import ParsingException -from parsing.telegram.telegram import get_channel -from parsing.telegram.models import MessageType +from parsing.telegram.parser import parse_channel class TelegramChannelFeed(Feed): FEED_ITEMS = 30 - def get_object(self, request, channel): + def get_object(self, request, channel_name): limit = int(request.GET.get("size") or self.FEED_ITEMS) only = str(request.GET.get("only") or "") - if only: - try: - only = [MessageType(item.strip()) for item in only.split(",")] - except (KeyError, ValueError): - return HttpResponseBadRequest() - - limit = 100 # dirty hack: artificially increase the limit to get more messages after filtering - - try: - return get_channel(channel, types=only, limit=limit) - except ParsingException: - raise Http404() + return parse_channel(channel_name, only_text=only == "text", limit=limit) def title(self, obj): - return obj.title - - def link(self, obj): - return obj.link - - def description(self, obj): - return obj.description + return obj.name def items(self, obj): return obj.messages + def link(self, obj): + return obj.url + def item_title(self, item): - return item.title + return item.text def item_description(self, item): - return item.clean_text + result = "" + if item.photo: + result += f"
" + if item.text: + result += str(item.text) + return result def item_link(self, item): - return item.link + return item.url + + def item_pubdate(self, item): + return item.created_at diff --git a/requirements.txt b/requirements.txt index ec1d060..d612420 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,4 @@ feedparser==5.2.1 sentry-sdk==0.14.1 nltk==3.4.5 newspaper3k>=0.2.8 -telethon==1.18.2 django-bleach==0.6.1 \ No newline at end of file diff --git a/setup_telegram.py b/setup_telegram.py deleted file mode 100644 index af0e4d4..0000000 --- a/setup_telegram.py +++ /dev/null @@ -1,11 +0,0 @@ -import os -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings") -import django -django.setup() - -from telethon.sync import TelegramClient -from django.conf import settings - - -with TelegramClient(settings.TELEGRAM_SESSION_FILE, settings.TELEGRAM_APP_ID, settings.TELEGRAM_APP_HASH) as client: - print("Successfully setup Telegram session.")