Telegram -> RSS parser (#16)
* Created the first dev version of the Telegram channel parser
* Created an endpoint for getting a Telegram channel as RSS
* Added *.session and venv/ to .gitignore
* Added the telethon dependency
* Added a script for initializing Telethon
* Removed the unnecessary script parse.py
* Added the ability to dynamically generate RSS feeds from a Telegram channel
* Removed saving of Telegram messages to the database
* Minor refactoring: created separate files parsers.py and utils.py
* Enhanced the text parser: parse title and description when possible
* Added attachment parsers: video, photo, voice and file parsers
* Added the ability to merge grouped messages into one item
* Renamed the URL for parsing a Telegram channel from "rss/telegram/<str:channel>" to "parsing/telegram/<str:channel>/"
* Formatted code with black
* Refactoring: created the package parsing.telegram and renamed utils to models
* Added the Makefile target telegram
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -36,3 +36,6 @@ local_settings.py
|
||||
media/images
|
||||
media/i
|
||||
boards_private.yml
|
||||
|
||||
*.session
|
||||
venv/
|
||||
3
Makefile
3
Makefile
@@ -39,6 +39,9 @@ mypy: ## Check types with mypy
|
||||
run: ## Runs dev server
|
||||
@python3 manage.py runserver
|
||||
|
||||
telegram: ## Set up the Telegram session (runs setup_telegram.py)
	@python3 setup_telegram.py
|
||||
|
||||
test-ci: test-requirements lint mypy ## Run tests (intended for CI usage)
|
||||
|
||||
test-requirements: ## Install requirements to run tests
|
||||
|
||||
@@ -17,6 +17,7 @@ INSTALLED_APPS = [
|
||||
"django.contrib.humanize",
|
||||
"auth",
|
||||
"boards",
|
||||
"parsing"
|
||||
]
|
||||
|
||||
MIDDLEWARE = [
|
||||
@@ -116,6 +117,10 @@ SENTRY_DSN = None
|
||||
MEDIA_UPLOAD_URL = "https://i.vas3k.ru/upload/"
|
||||
MEDIA_UPLOAD_CODE = None # should be set in private_settings.py
|
||||
|
||||
TELEGRAM_APP_ID = None  # should be set in private_settings.py
|
||||
TELEGRAM_APP_HASH = None  # should be set in private_settings.py
|
||||
TELEGRAM_SESSION_FILE = None  # should be set in private_settings.py
|
||||
|
||||
try:
|
||||
# poor mans' private settings
|
||||
# As due to obvious reasons this file is missing in the repository, suppress the following 'pyflakes' error codes:
|
||||
|
||||
@@ -2,6 +2,7 @@ from django.urls import path
|
||||
|
||||
from auth.views import login, logout, club_callback
|
||||
from boards.views import index, board, privacy_policy, what, export
|
||||
from parsing.views import TelegramChannelFeed
|
||||
|
||||
urlpatterns = [
|
||||
path("", index, name="index"),
|
||||
@@ -15,4 +16,5 @@ urlpatterns = [
|
||||
|
||||
path("<slug:board_slug>/", board, name="board"),
|
||||
path("<slug:board_slug>/export/", export, name="export"),
|
||||
path("parsing/telegram/<str:channel>/", TelegramChannelFeed(), name="telegram_channel_feed")
|
||||
]
|
||||
|
||||
0
parsing/__init__.py
Normal file
0
parsing/__init__.py
Normal file
5
parsing/apps.py
Normal file
5
parsing/apps.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class ParsingConfig(AppConfig):
    """Django application configuration for the ``parsing`` app."""

    name = "parsing"
|
||||
0
parsing/telegram/__init__.py
Normal file
0
parsing/telegram/__init__.py
Normal file
69
parsing/telegram/models.py
Normal file
69
parsing/telegram/models.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class TelegramChannel:
    """In-memory description of a Telegram channel and its parsed messages."""

    def __init__(
        self, channel_id=None, title=None, link=None, description=None, messages=None
    ):
        """Store the channel attributes.

        ``messages`` is kept by reference when given; when omitted, every
        instance gets its own fresh empty list.
        """
        self.channel_id = channel_id
        self.title = title
        self.link = link
        self.description = description
        self.messages = messages if messages is not None else []

    def add_message(self, message):
        """Append ``message`` to the end of the channel's message list."""
        self.messages.append(message)

    def remove_message(self, message):
        """Remove the first occurrence of ``message``.

        Raises:
            ValueError: if ``message`` is not in the list (``list.remove``).
        """
        self.messages.remove(message)

    def to_dict(self):
        """Return the channel as a plain dict.

        Each message is serialized through its own ``to_dict()``.
        """
        return {
            "channel_id": self.channel_id,
            "title": self.title,
            "link": self.link,
            "description": self.description,
            "messages": [message.to_dict() for message in self.messages],
        }
|
||||
|
||||
|
||||
class TelegramChannelMessage:
    """A single channel message reduced to the fields needed for a feed item."""

    def __init__(
        self,
        telegram_id=None,
        title=None,
        description=None,
        link=None,
        channel=None,
        grouped_id=None,
        type=None,
        timestamp=None,
    ):
        """Store the message attributes; every field is optional.

        ``channel`` is expected to expose ``channel_id``, ``type`` to expose
        ``value`` and ``timestamp`` to expose ``strftime`` (see ``to_dict``).
        """
        self.telegram_id = telegram_id
        self.title = title
        self.description = description
        self.link = link
        self.channel = channel
        self.grouped_id = grouped_id
        self.type = type
        self.timestamp = timestamp

    def to_dict(self):
        """Return the message as a plain dict.

        ``channel`` is reduced to its ``channel_id``, ``type`` to its
        ``value`` and ``timestamp`` is formatted as ``YYYY-MM-DD HH:MM:SS``.
        Fields that are still ``None`` are emitted as ``None`` instead of
        raising, since all of them default to ``None`` in the constructor.
        """
        channel_id = self.channel.channel_id if self.channel is not None else None
        type_value = self.type.value if self.type is not None else None
        timestamp = (
            self.timestamp.strftime("%Y-%m-%d %H:%M:%S")
            if self.timestamp is not None
            else None
        )
        return {
            "telegram_id": self.telegram_id,
            "title": self.title,
            "description": self.description,
            "link": self.link,
            "channel": channel_id,
            "grouped_id": self.grouped_id,
            "type": type_value,
            "timestamp": timestamp,
        }
|
||||
|
||||
|
||||
class MessageType(Enum):
    """Kinds of channel message; the string value is what ``to_dict`` emits."""

    TEXT = "text"
    VIDEO = "video"
    PHOTO = "photo"
    VOICE = "voice"
    FILE = "file"
|
||||
124
parsing/telegram/parsers.py
Normal file
124
parsing/telegram/parsers.py
Normal file
@@ -0,0 +1,124 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType
|
||||
import re
|
||||
|
||||
|
||||
class Parser(ABC):
    """Base class for objects that turn a raw Telegram message into a feed item."""

    def parse(self, channel, message):
        """Build a TelegramChannelMessage carrying the type-independent fields.

        Only the channel, link, ids and timestamp are filled in here; title,
        description and type are left for subclasses to set.
        """
        message_id = message.id
        message_link = _TELEGRAM_MESSAGE_LINK.format(channel.channel_id, message_id)
        return TelegramChannelMessage(
            telegram_id=message_id,
            grouped_id=message.grouped_id,
            timestamp=message.date,
            link=message_link,
            channel=channel,
        )

    @abstractmethod
    def matches(self, channel, message):
        """Tell whether this parser is able to handle ``message``."""

    @staticmethod
    def from_message(channel, message):
        """Return the first registered parser that matches, or None if none does."""
        candidates = (
            candidate
            for candidate in _messages_parsers
            if candidate.matches(channel, message)
        )
        return next(candidates, None)
|
||||
|
||||
|
||||
class SimpleTextParser(Parser):
    """Parser for messages that contain text.

    The first line of the text becomes the item title and the remainder
    becomes the description.
    """

    # Raw strings so that regex escapes such as ``\*`` are not treated as
    # (invalid) string escape sequences.
    MARKDOWN_BOLD = r".+\*.+\*.+"
    # Group 1: the first line. Group 2: everything after the separating
    # newline(s), including any further lines, so no part of the post is lost.
    SIMPLE_TEXT_POST = r"(.+)\n+([^\n][\s\S]*)"

    def parse(self, channel, message):
        """Return a TEXT item with title and description taken from the text."""
        parsed_message = super().parse(channel, message)
        title, description = self.parse_text(message)
        parsed_message.title = self.enhance_title(title)
        parsed_message.description = description
        parsed_message.type = MessageType.TEXT
        return parsed_message

    def parse_text(self, message):
        """Split ``message.text`` into ``(title, description)``.

        When the text has a first line followed by more content, the first
        line is the title and the rest is the description. Otherwise the
        whole text is used for both.
        """
        text = message.text
        matcher = re.match(self.SIMPLE_TEXT_POST, text)
        if matcher is None:
            return text, text
        return matcher.group(1), matcher.group(2)

    def enhance_title(self, title):
        """Strip all ``*`` characters from ``title`` when it matches MARKDOWN_BOLD.

        Titles that do not match the pattern are returned unchanged.
        """
        if re.match(self.MARKDOWN_BOLD, title) is None:
            return title
        return title.replace("*", "")

    def matches(self, channel, message):
        """Return True when the message has non-empty text."""
        return message.text is not None and len(message.text) > 0
|
||||
|
||||
|
||||
class VideoParser(Parser):
    """Parser for messages that carry a video attachment."""

    def parse(self, channel, message):
        """Return a VIDEO item with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.VIDEO
        item.title = "[video]"
        return item

    def matches(self, channel, message):
        """Return True when ``message.video`` is set."""
        video = message.video
        return video is not None
|
||||
|
||||
|
||||
class PhotoParser(Parser):
    """Parser for messages that carry a photo attachment."""

    def parse(self, channel, message):
        """Return a PHOTO item with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.PHOTO
        item.title = "[photo]"
        return item

    def matches(self, channel, message):
        """Return True when ``message.photo`` is set."""
        photo = message.photo
        return photo is not None
|
||||
|
||||
|
||||
class FileParser(Parser):
    """Parser for messages that carry a file attachment."""

    def parse(self, channel, message):
        """Return a FILE item with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.FILE
        item.title = "[file]"
        return item

    def matches(self, channel, message):
        """Return True when ``message.file`` is set."""
        attached_file = message.file
        return attached_file is not None
|
||||
|
||||
|
||||
class VoiceParser(Parser):
    """Parser for messages that carry a voice recording."""

    def parse(self, channel, message):
        """Return a VOICE item with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.VOICE
        item.title = "[voice]"
        return item

    def matches(self, channel, message):
        """Return True when ``message.voice`` is set."""
        voice = message.voice
        return voice is not None
|
||||
|
||||
|
||||
def parse_channel(channel_id, chat_full):
    """Build a TelegramChannel (without messages) from a full-channel response.

    The title and username come from the first entry of ``chat_full.chats``
    and the description from ``chat_full.full_chat.about``.
    """
    primary_chat = chat_full.chats[0]
    channel_title = primary_chat.title
    about = chat_full.full_chat.about
    channel_link = _TELEGRAM_CHANNEL_LINK.format(primary_chat.username)
    return TelegramChannel(
        channel_id=channel_id,
        title=channel_title,
        description=about,
        link=channel_link,
    )
|
||||
|
||||
|
||||
# Registry consulted by Parser.from_message, which returns the first entry
# whose matches() is true -- so the order below is the matching priority.
_messages_parsers = [
    SimpleTextParser(),
    VideoParser(),
    PhotoParser(),
    VoiceParser(),
    FileParser(),
]

# URL template for a channel; parse_channel fills it with the chat username.
_TELEGRAM_CHANNEL_LINK = "https://t.me/{}"
# URL template for one message; Parser.parse fills it with the channel id
# and the message id.
_TELEGRAM_MESSAGE_LINK = _TELEGRAM_CHANNEL_LINK + "/{}"
|
||||
63
parsing/telegram/telegram.py
Normal file
63
parsing/telegram/telegram.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from telethon.sync import TelegramClient, functions
|
||||
from django.conf import settings
|
||||
from parsing.telegram.parsers import Parser, parse_channel
|
||||
from parsing.telegram.models import MessageType
|
||||
import asyncio
|
||||
|
||||
|
||||
def get_channel(channel_id, messages_limit):
    """Fetch a channel and up to ``messages_limit`` of its messages from Telegram.

    Args:
        channel_id: identifier of the channel, passed to Telethon as-is.
        messages_limit: maximum number of messages to request.

    Returns:
        A TelegramChannel whose ``messages`` holds the parsed messages.
    """
    # A dedicated event loop is created for every call and handed to the
    # client. NOTE(review): presumably because the calling thread (a Django
    # request worker) has no event loop of its own -- confirm.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        with TelegramClient(
            settings.TELEGRAM_SESSION_FILE,
            settings.TELEGRAM_APP_ID,
            settings.TELEGRAM_APP_HASH,
            loop=loop,
        ) as client:
            channel = parse_channel(
                channel_id,
                client(functions.channels.GetFullChannelRequest(channel=channel_id)),
            )
            channel.messages = __get_channel_messages(client, channel, messages_limit)
            return channel
    finally:
        # Release the loop's resources; otherwise every call leaves an
        # unclosed event loop behind.
        loop.close()
|
||||
|
||||
|
||||
def _group_indexes(messages, grouped_id, message_type, inverse=False):
    """Return indexes of the messages in group ``grouped_id`` filtered by type.

    With ``inverse=False`` only messages whose type equals ``message_type``
    are selected; with ``inverse=True`` only those whose type differs.
    """
    return [
        index
        for index, message in enumerate(messages)
        if message.grouped_id == grouped_id
        and (message.type == message_type) != inverse
    ]


def _merge_grouped_message(messages, new_message):
    """Merge ``new_message`` into ``messages`` so a captioned group yields one item.

    A text message replaces every non-text message of its group that was
    collected so far. A non-text message is added only while its group has
    no text message yet.
    """
    grouped_id = new_message.grouped_id
    if new_message.type == MessageType.TEXT:
        attachment_indexes = _group_indexes(
            messages, grouped_id, MessageType.TEXT, inverse=True
        )
        if not attachment_indexes:
            # Nothing to replace: keep the text message instead of losing it.
            messages.append(new_message)
            return
        # The text takes the slot of the last attachment of the group ...
        messages[attachment_indexes.pop()] = new_message
        # ... and the remaining ones are dropped, highest index first so the
        # indexes still to be deleted stay valid.
        for index in reversed(attachment_indexes):
            del messages[index]
    elif not _group_indexes(messages, grouped_id, MessageType.TEXT):
        messages.append(new_message)


def __get_channel_messages(client, channel, messages_limit):
    """Read up to ``messages_limit`` messages of ``channel`` and parse them.

    Messages for which no parser matches are skipped. Messages that share a
    ``grouped_id`` are merged by ``_merge_grouped_message``; all others are
    appended in the order the client yields them.

    Returns:
        list of parsed messages.
    """
    channel_messages = []
    for t_message in client.iter_messages(channel.channel_id, limit=messages_limit):
        parser = Parser.from_message(channel, t_message)
        if parser is None:
            continue
        message = parser.parse(channel, t_message)
        if message is None:
            continue
        if message.grouped_id is None:
            channel_messages.append(message)
        else:
            _merge_grouped_message(channel_messages, message)
    return channel_messages
|
||||
34
parsing/views.py
Normal file
34
parsing/views.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from django.contrib.syndication.views import Feed
|
||||
from parsing.telegram.telegram import get_channel
|
||||
|
||||
|
||||
class TelegramChannelFeed(Feed):
    """RSS feed generated on the fly from a Telegram channel."""

    # Number of items returned when the request has no usable ``size``.
    FEED_ITEMS = 30
    # Upper bound for ``size``: the value comes straight from the query
    # string and each item has to be fetched from Telegram.
    MAX_FEED_ITEMS = 100

    def get_object(self, request, channel):
        """Fetch the channel named in the URL with the requested number of items."""
        return get_channel(channel, self._feed_size(request))

    def _feed_size(self, request):
        """Return the item count requested through the ``size`` query parameter.

        A missing, non-integer or non-positive value falls back to
        FEED_ITEMS; larger values are capped at MAX_FEED_ITEMS.
        """
        raw_size = request.GET.get("size")
        if raw_size is None:
            return self.FEED_ITEMS
        try:
            size = int(raw_size)
        except ValueError:
            return self.FEED_ITEMS
        if size <= 0:
            return self.FEED_ITEMS
        return min(size, self.MAX_FEED_ITEMS)

    def title(self, obj):
        """Feed title: the channel title."""
        return obj.title

    def link(self, obj):
        """Feed link: the channel link."""
        return obj.link

    def description(self, obj):
        """Feed description: the channel description."""
        return obj.description

    def items(self, obj):
        """Feed items: the channel's parsed messages."""
        return obj.messages

    def item_title(self, item):
        """Item title: the message title."""
        return item.title

    def item_description(self, item):
        """Item description: the message description."""
        return item.description

    def item_link(self, item):
        """Item link: the message link."""
        return item.link
|
||||
@@ -11,3 +11,4 @@ sentry-sdk==0.13.5
|
||||
pyjwt==1.7.1
|
||||
nltk==3.4.5
|
||||
newspaper3k>=0.2.8
|
||||
telethon==1.10.10
|
||||
|
||||
11
setup_telegram.py
Normal file
11
setup_telegram.py
Normal file
@@ -0,0 +1,11 @@
|
||||
# Helper script (run by the Makefile ``telegram`` target) that opens a
# TelegramClient using the project's Django settings.
import os

# The settings module must be configured before django.setup() runs, and
# django.setup() must run before django.conf.settings is read below, which is
# why the imports are interleaved with statements.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")
import django

django.setup()

from telethon.sync import TelegramClient
from django.conf import settings

# NOTE(review): entering the client context presumably performs the login and
# writes the session file named by TELEGRAM_SESSION_FILE -- confirm against
# the Telethon docs. The ``client`` object itself is not used.
with TelegramClient(settings.TELEGRAM_SESSION_FILE, settings.TELEGRAM_APP_ID, settings.TELEGRAM_APP_HASH) as client:
    print("Successfully setup Telegram session.")
|
||||
Reference in New Issue
Block a user