Telegram -> RSS parser (#16)

* Created first dev version of Telegram channels parser

* Created endpoint for getting the Telegram channel RSS feed

* Added *.session and venv/ to .gitignore

* Added dependency telethon

* Added script for initializing Telethon

* Removed unnecessary script parse.py

* Added the ability to dynamically generate RSS feeds from a Telegram channel

* Deleted saving Telegram messages in database

* Minor refactor

Created separate files: parsers.py and utils.py

* Enhanced text parser: parse title and description when possible

* Added attachment parsers: video, photo, voice and file parsers

* Added the ability to merge grouped messages into one item

* Renamed the URL for parsing a Telegram channel from "rss/telegram/<str:channel>" to "parsing/telegram/<str:channel>/"

* Format code with black

* Refactor: created package parsing.telegram and rename utils to models

* Added Makefile target telegram
This commit is contained in:
Vitalii Honchar
2020-02-12 11:57:52 +02:00
committed by GitHub
parent ca4bba46c2
commit 2eaa9fa65c
13 changed files with 320 additions and 0 deletions

3
.gitignore vendored
View File

@@ -36,3 +36,6 @@ local_settings.py
media/images
media/i
boards_private.yml
*.session
venv/

View File

@@ -39,6 +39,9 @@ mypy: ## Check types with mypy
run: ## Runs dev server
@python3 manage.py runserver
telegram: ## Set up the Telegram session
@python3 setup_telegram.py
test-ci: test-requirements lint mypy ## Run tests (intended for CI usage)
test-requirements: ## Install requirements to run tests

View File

@@ -17,6 +17,7 @@ INSTALLED_APPS = [
"django.contrib.humanize",
"auth",
"boards",
"parsing"
]
MIDDLEWARE = [
@@ -116,6 +117,10 @@ SENTRY_DSN = None
MEDIA_UPLOAD_URL = "https://i.vas3k.ru/upload/"
MEDIA_UPLOAD_CODE = None # should be set in private_settings.py
TELEGRAM_APP_ID = None # should be set in private_settings.py
TELEGRAM_APP_HASH = None # should be set in private_settings.py
TELEGRAM_SESSION_FILE = None # should be set in private_settings.py
try:
# poor mans' private settings
# As due to obvious reasons this file is missing in the repository, suppress the following 'pyflakes' error codes:

View File

@@ -2,6 +2,7 @@ from django.urls import path
from auth.views import login, logout, club_callback
from boards.views import index, board, privacy_policy, what, export
from parsing.views import TelegramChannelFeed
urlpatterns = [
path("", index, name="index"),
@@ -15,4 +16,5 @@ urlpatterns = [
path("<slug:board_slug>/", board, name="board"),
path("<slug:board_slug>/export/", export, name="export"),
path("parsing/telegram/<str:channel>/", TelegramChannelFeed(), name="telegram_channel_feed")
]

0
parsing/__init__.py Normal file
View File

5
parsing/apps.py Normal file
View File

@@ -0,0 +1,5 @@
from django.apps import AppConfig
class ParsingConfig(AppConfig):
    """Django application configuration for the ``parsing`` app."""

    # Label under which the app is listed in INSTALLED_APPS.
    name = "parsing"

View File

View File

@@ -0,0 +1,69 @@
from enum import Enum
class TelegramChannel:
    """Plain container describing a Telegram channel and its parsed messages."""

    def __init__(
        self, channel_id=None, title=None, link=None, description=None, messages=None
    ):
        # Build a fresh list per instance so channels never share message state.
        if messages is None:
            messages = []
        self.channel_id = channel_id
        self.title = title
        self.link = link
        self.description = description
        self.messages = messages

    def add_message(self, message):
        """Append ``message`` to the end of the channel's message list."""
        self.messages.append(message)

    def remove_message(self, message):
        """Remove the first occurrence of ``message``.

        Raises:
            ValueError: if ``message`` is not in the list (from ``list.remove``).
        """
        self.messages.remove(message)

    def to_dict(self):
        """Return the channel as a dict, with each message serialized via ``to_dict()``."""
        serialized_messages = [item.to_dict() for item in self.messages]
        return {
            "channel_id": self.channel_id,
            "title": self.title,
            "link": self.link,
            "description": self.description,
            "messages": serialized_messages,
        }
class TelegramChannelMessage:
    """Plain container for one parsed channel message (one feed item)."""

    def __init__(
        self,
        telegram_id=None,
        title=None,
        description=None,
        link=None,
        channel=None,
        grouped_id=None,
        type=None,
        timestamp=None,
    ):
        self.telegram_id = telegram_id
        self.title = title
        self.description = description
        self.link = link
        self.channel = channel
        self.grouped_id = grouped_id
        self.type = type
        self.timestamp = timestamp

    def to_dict(self):
        """Return the message as a dict of plain values.

        ``channel`` is reduced to its ``channel_id``, ``type`` to its
        ``value`` and ``timestamp`` to a ``"%Y-%m-%d %H:%M:%S"`` string.
        Each of the three is optional in the constructor, so an unset field
        is serialized as ``None`` instead of raising ``AttributeError``.
        """
        channel_id = None if self.channel is None else self.channel.channel_id
        type_value = None if self.type is None else self.type.value
        formatted_timestamp = (
            None
            if self.timestamp is None
            else self.timestamp.strftime("%Y-%m-%d %H:%M:%S")
        )
        return {
            "telegram_id": self.telegram_id,
            "title": self.title,
            "description": self.description,
            "link": self.link,
            "channel": channel_id,
            "grouped_id": self.grouped_id,
            "type": type_value,
            "timestamp": formatted_timestamp,
        }
class MessageType(Enum):
    """Kinds of message content the parsers tag a feed item with."""

    # Message with a non-empty text body.
    TEXT = "text"

    # Attachment kinds.
    VIDEO = "video"
    PHOTO = "photo"
    VOICE = "voice"
    FILE = "file"

124
parsing/telegram/parsers.py Normal file
View File

@@ -0,0 +1,124 @@
from abc import ABC, abstractmethod
from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType
import re
class Parser(ABC):
    """Base class for objects that turn a Telegram message into a feed item."""

    def parse(self, channel, message):
        """Build a ``TelegramChannelMessage`` with the type-independent fields.

        Sets the owning channel, the message link, the Telegram id, the
        grouped id and the timestamp. Subclasses extend the result with a
        title, description and type.
        """
        message_link = _TELEGRAM_MESSAGE_LINK.format(channel.channel_id, message.id)
        return TelegramChannelMessage(
            telegram_id=message.id,
            link=message_link,
            channel=channel,
            grouped_id=message.grouped_id,
            timestamp=message.date,
        )

    @abstractmethod
    def matches(self, channel, message):
        """Return whether this parser is able to handle ``message``."""

    @staticmethod
    def from_message(channel, message):
        """Return the first registered parser matching ``message``, or ``None``."""
        return next(
            (
                candidate
                for candidate in _messages_parsers
                if candidate.matches(channel, message)
            ),
            None,
        )
class SimpleTextParser(Parser):
    """Parser for messages with a non-empty text body.

    The first line of the text becomes the item title and everything after
    the following run of newlines becomes the description.
    """

    # Raw strings: "\*" is a regex escape, not a Python string escape.
    MARKDOWN_BOLD = r".+\*.+\*.+"
    # Group 1 is the first line. Group 2 is the whole remainder of the text;
    # it has to start with a non-newline character so that text consisting of
    # a single line plus trailing newlines is not treated as title + body.
    SIMPLE_TEXT_POST = r"(.+)\n+([^\n][\s\S]*)"

    def parse(self, channel, message):
        """Return the common feed item filled with title, description and TEXT type."""
        parsed_message = super().parse(channel, message)
        title, description = self.parse_text(message)
        parsed_message.title = self.enhance_title(title)
        parsed_message.description = description
        parsed_message.type = MessageType.TEXT
        return parsed_message

    def parse_text(self, message):
        """Split ``message.text`` into a ``(title, description)`` tuple.

        When the text does not have the "first line, newline(s), body" shape,
        the whole text is used for both the title and the description.
        """
        text = message.text
        matcher = re.match(self.SIMPLE_TEXT_POST, text)
        if matcher is None:
            return text, text
        return matcher.group(1), matcher.group(2)

    def enhance_title(self, title):
        """Strip ``*`` characters from a title that matches ``MARKDOWN_BOLD``."""
        if re.match(self.MARKDOWN_BOLD, title) is None:
            return title
        return title.replace("*", "")

    def matches(self, channel, message):
        """Return ``True`` when the message has a non-empty text body."""
        return bool(message.text)
class VideoParser(Parser):
    """Parser for messages whose ``video`` attribute is set."""

    def parse(self, channel, message):
        """Return the common feed item tagged as VIDEO with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.VIDEO
        item.title = "[video]"
        return item

    def matches(self, channel, message):
        """Return ``True`` when the message carries a video."""
        video = message.video
        return video is not None
class PhotoParser(Parser):
    """Parser for messages whose ``photo`` attribute is set."""

    def parse(self, channel, message):
        """Return the common feed item tagged as PHOTO with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.PHOTO
        item.title = "[photo]"
        return item

    def matches(self, channel, message):
        """Return ``True`` when the message carries a photo."""
        photo = message.photo
        return photo is not None
class FileParser(Parser):
    """Parser for messages whose ``file`` attribute is set."""

    def parse(self, channel, message):
        """Return the common feed item tagged as FILE with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.FILE
        item.title = "[file]"
        return item

    def matches(self, channel, message):
        """Return ``True`` when the message carries a file."""
        attached_file = message.file
        return attached_file is not None
class VoiceParser(Parser):
    """Parser for messages whose ``voice`` attribute is set."""

    def parse(self, channel, message):
        """Return the common feed item tagged as VOICE with a placeholder title."""
        item = super().parse(channel, message)
        item.type = MessageType.VOICE
        item.title = "[voice]"
        return item

    def matches(self, channel, message):
        """Return ``True`` when the message carries a voice recording."""
        voice = message.voice
        return voice is not None
def parse_channel(channel_id, chat_full):
    """Build a ``TelegramChannel`` (without messages) from a full-channel response.

    The title and the username used for the link come from the first entry of
    ``chat_full.chats``; the description comes from ``chat_full.full_chat.about``.
    """
    chat = chat_full.chats[0]
    about = chat_full.full_chat.about
    channel_link = _TELEGRAM_CHANNEL_LINK.format(chat.username)
    return TelegramChannel(
        channel_id=channel_id,
        title=chat.title,
        description=about,
        link=channel_link,
    )
# URL templates: the channel link takes the channel name, the message link
# additionally takes the message id.
_TELEGRAM_CHANNEL_LINK = "https://t.me/{}"
_TELEGRAM_MESSAGE_LINK = _TELEGRAM_CHANNEL_LINK + "/{}"

# Registered parsers, tried in this order; the first one that matches wins,
# so a message with text is always handled by the text parser.
_messages_parsers = [
    SimpleTextParser(),
    VideoParser(),
    PhotoParser(),
    VoiceParser(),
    FileParser(),
]

View File

@@ -0,0 +1,63 @@
from telethon.sync import TelegramClient, functions
from django.conf import settings
from parsing.telegram.parsers import Parser, parse_channel
from parsing.telegram.models import MessageType
import asyncio
def get_channel(channel_id, messages_limit):
    """Fetch a channel's details and its most recent messages from Telegram.

    Args:
        channel_id: channel identifier passed to ``GetFullChannelRequest`` and
            to the message iterator.
        messages_limit: maximum number of Telegram messages to read.

    Returns:
        A ``TelegramChannel`` whose ``messages`` list holds the parsed items.
    """
    # A dedicated loop is created for every call and handed to the client.
    # NOTE(review): presumably because the calling thread may not have an
    # event loop of its own - confirm against the deployment setup.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        with TelegramClient(
            settings.TELEGRAM_SESSION_FILE,
            settings.TELEGRAM_APP_ID,
            settings.TELEGRAM_APP_HASH,
            loop=loop,
        ) as client:
            chat_full = client(
                functions.channels.GetFullChannelRequest(channel=channel_id)
            )
            channel = parse_channel(channel_id, chat_full)
            channel.messages = __get_channel_messages(
                client, channel, messages_limit
            )
        return channel
    finally:
        # Release the loop's resources; without this every call leaves an
        # unclosed event loop behind. The loop is unregistered first so the
        # thread is not left pointing at a closed loop.
        asyncio.set_event_loop(None)
        loop.close()
def __get_channel_messages(client, channel, messages_limit):
    """Read up to ``messages_limit`` messages of ``channel`` and parse them.

    Messages for which no parser matches are skipped. Messages that share a
    ``grouped_id`` are merged around their text message: the text message
    replaces the group's non-text items, and non-text items are ignored once
    the group already has a text message.

    Returns:
        A list of ``TelegramChannelMessage`` objects in iteration order.
    """

    def group_indexes(messages, grouped_id, message_type, inverse=False):
        # Positions of the group's messages whose type equals ``message_type``
        # (or differs from it when ``inverse`` is set).
        return [
            index
            for index, candidate in enumerate(messages)
            if candidate.grouped_id == grouped_id
            and (candidate.type == message_type) != inverse
        ]

    def merge_messages(messages, new_message):
        if new_message.type == MessageType.TEXT:
            attachment_indexes = group_indexes(
                messages, new_message.grouped_id, MessageType.TEXT, inverse=True
            )
            if not attachment_indexes:
                # Nothing to replace: keep the text message instead of
                # silently dropping it.
                messages.append(new_message)
                return
            # The text message takes the slot of the last attachment ...
            messages[attachment_indexes.pop()] = new_message
            # ... and the remaining attachments are deleted back to front so
            # the not-yet-processed indexes stay valid.
            for index in reversed(attachment_indexes):
                del messages[index]
        elif not group_indexes(messages, new_message.grouped_id, MessageType.TEXT):
            # The group has no text message yet, so keep the attachment.
            messages.append(new_message)

    channel_messages = []
    for t_message in client.iter_messages(channel.channel_id, limit=messages_limit):
        parser = Parser.from_message(channel, t_message)
        if parser is None:
            continue
        message = parser.parse(channel, t_message)
        if message is None:
            continue
        if message.grouped_id is not None:
            merge_messages(channel_messages, message)
        else:
            channel_messages.append(message)
    return channel_messages

34
parsing/views.py Normal file
View File

@@ -0,0 +1,34 @@
from django.contrib.syndication.views import Feed
from parsing.telegram.telegram import get_channel
class TelegramChannelFeed(Feed):
    """Syndication feed generated on the fly from a Telegram channel."""

    # Number of Telegram messages read when the request has no usable "size".
    FEED_ITEMS = 30

    def get_object(self, request, channel):
        """Load the channel named in the URL, reading the requested number of messages.

        The optional ``size`` query parameter overrides ``FEED_ITEMS``.
        """
        return get_channel(channel, self._requested_size(request))

    def _requested_size(self, request):
        """Return the ``size`` query parameter as an int, or ``FEED_ITEMS``.

        A missing or non-numeric value falls back to the default instead of
        letting ``ValueError`` escape from the view.
        """
        # NOTE(review): there is no upper bound on the requested size; consider
        # capping it, since the value comes straight from the query string.
        raw_size = request.GET.get("size")
        if raw_size is None:
            return self.FEED_ITEMS
        try:
            return int(raw_size)
        except ValueError:
            return self.FEED_ITEMS

    def title(self, obj):
        """Feed title: the channel title."""
        return obj.title

    def link(self, obj):
        """Feed link: the channel link."""
        return obj.link

    def description(self, obj):
        """Feed description: the channel description."""
        return obj.description

    def items(self, obj):
        """Feed items: the channel's parsed messages."""
        return obj.messages

    def item_title(self, item):
        """Item title: the message title."""
        return item.title

    def item_description(self, item):
        """Item description: the message description."""
        return item.description

    def item_link(self, item):
        """Item link: the message link."""
        return item.link

View File

@@ -11,3 +11,4 @@ sentry-sdk==0.13.5
pyjwt==1.7.1
nltk==3.4.5
newspaper3k>=0.2.8
telethon==1.10.10

11
setup_telegram.py Normal file
View File

@@ -0,0 +1,11 @@
import os

# The settings module has to be named before Django is initialised.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")

import django

django.setup()

from telethon.sync import TelegramClient
from django.conf import settings

# Entering the client context is the whole point of the script; the message is
# printed once the context has been entered.
with TelegramClient(
    settings.TELEGRAM_SESSION_FILE,
    settings.TELEGRAM_APP_ID,
    settings.TELEGRAM_APP_HASH,
) as client:
    print("Successfully setup Telegram session.")