New telegram parser, removed telethon

This commit is contained in:
vas3k
2020-12-22 17:13:43 +01:00
parent dd854e395e
commit 2b9c95c49d
9 changed files with 91 additions and 294 deletions

View File

@@ -13,7 +13,8 @@ urlpatterns = [
path("<slug:board_slug>/", board, name="board"),
path("parsing/telegram/<str:channel>/",
path("parsing/telegram/<str:channel_name>/",
cache_page(settings.TELEGRAM_CACHE_SECONDS)(TelegramChannelFeed()),
name="telegram_channel_feed")
name="telegram_channel_feed"),
]

View File

@@ -1,16 +0,0 @@
import re
def cleanup_telegram_message_text(text):
    """Return ``text`` as plain text with Telegram/markdown noise removed.

    The input is coerced with ``str()``, every noise pattern is deleted in
    the order listed below, and surrounding whitespace is trimmed last.
    """
    noise_patterns = (
        r"\[.*?\]\(.*?\)",  # attached files and images
        r"#[\S]*",  # hashtags
        r"[\*\~\`]+",  # stars and tildas
        r"<[^>]*>",  # html tags
        r"[\n]{3,}",  # triple newlines
    )
    cleaned = str(text)
    # Patterns are applied one after another, so a later pattern sees the
    # output of the earlier ones.
    for noise in noise_patterns:
        cleaned = re.compile(noise, re.MULTILINE).sub("", cleaned)
    return cleaned.strip()

View File

@@ -1,49 +0,0 @@
from enum import Enum
class TelegramChannel:
    """Plain container describing a Telegram channel and its messages."""

    def __init__(self, channel_id=None, title=None, link=None, description=None, messages=None):
        """Store the given fields; every field is optional and defaults to None."""
        self.channel_id, self.title = channel_id, title
        self.link, self.description = link, description
        # A falsy ``messages`` (None or an empty container) is replaced by a
        # fresh list, so instances never share a default list.
        self.messages = messages if messages else []
class TelegramChannelMessage:
    """Plain container for one parsed message of a Telegram channel."""

    def __init__(
        self,
        telegram_id=None,
        title=None,
        text=None,
        clean_text=None,
        link=None,
        channel=None,
        grouped_id=None,
        type=None,
        timestamp=None,
    ):
        """Store the given fields verbatim; every field defaults to None."""
        self.telegram_id, self.title, self.text = telegram_id, title, text
        self.clean_text, self.link, self.channel = clean_text, link, channel
        self.grouped_id, self.type, self.timestamp = grouped_id, type, timestamp
class MessageType(Enum):
    """Kinds of channel message; each value is the lowercase name of the kind."""

    TEXT = "text"
    VIDEO = "video"
    PHOTO = "photo"
    VOICE = "voice"
    FILE = "file"

View File

@@ -0,0 +1,70 @@
import re
from collections import namedtuple
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from scripts.common import DEFAULT_REQUEST_TIMEOUT, DEFAULT_REQUEST_HEADERS
# URL prefix of a channel's web view; parse_channel appends the channel name.
TELEGRAM_CHANNEL_WEBVIEW_PREFIX = "https://t.me/s/"

# Captures the https URL inside a CSS ``url('...')`` value. Written as a raw
# string so that ``\(`` and ``\)`` reach the regex engine as escapes instead of
# being (invalid) string-literal escape sequences.
BACKGROUND_IMAGE_RE = re.compile(r"url\('(https://.+?)'\)")

# CSS selectors used to locate message elements and their parts.
TELEGRAM_MESSAGE_CLASS = ".tgme_widget_message"
TELEGRAM_MESSAGE_TEXT_CLASS = ".tgme_widget_message_text"
TELEGRAM_MESSAGE_PHOTO_CLASS = ".tgme_widget_message_photo_wrap"
TELEGRAM_MESSAGE_DATE_CLASS = ".tgme_widget_message_date"

# Lightweight immutable records returned by parse_channel.
TelegramChannel = namedtuple("TelegramChannel", ["url", "name", "messages"])
TelegramMessage = namedtuple("TelegramMessage", ["url", "text", "photo", "created_at"])
def _parse_message(message_tag) -> TelegramMessage:
    """Convert one message element of the channel web view into a TelegramMessage.

    Missing parts are tolerated: absent text/photo/url become ``None`` and a
    missing or malformed timestamp falls back to the current UTC time.
    """
    # Local import: the module-level import only brings in ``datetime``.
    from datetime import timezone

    text_tag = message_tag.select_one(TELEGRAM_MESSAGE_TEXT_CLASS)
    # The text is kept as the element's HTML markup, not as plain text.
    message_text = str(text_tag) if text_tag is not None else None

    message_photo = None
    photo_tag = message_tag.select_one(TELEGRAM_MESSAGE_PHOTO_CLASS)
    if photo_tag is not None:
        # The photo URL lives in an inline ``url('...')`` style; a wrapper
        # without such a URL simply yields no photo instead of crashing.
        photo_match = BACKGROUND_IMAGE_RE.search(str(photo_tag))
        if photo_match:
            message_photo = photo_match.group(1)

    message_url = None
    # Timezone-aware fallback so it never mixes with the offset-carrying
    # values produced by ``fromisoformat`` below.
    message_time = datetime.now(timezone.utc)
    date_tag = message_tag.select_one(TELEGRAM_MESSAGE_DATE_CLASS)
    if date_tag is not None:
        message_url = date_tag.get("href")
        time_tag = date_tag.select_one("time")
        raw_datetime = time_tag.get("datetime") if time_tag is not None else None
        if raw_datetime:
            try:
                message_time = datetime.fromisoformat(raw_datetime)
            except ValueError:
                # Unparseable timestamp: keep the fallback rather than
                # failing the whole channel because of one message.
                pass

    return TelegramMessage(
        url=message_url,
        text=message_text,
        photo=message_photo,
        created_at=message_time,
    )


def parse_channel(channel_name, only_text=False, limit=100) -> TelegramChannel:
    """Fetch a channel's web view and parse its messages.

    Args:
        channel_name: Channel name appended to TELEGRAM_CHANNEL_WEBVIEW_PREFIX.
        only_text: When true, messages without text are dropped.
        limit: Maximum number of messages to return.

    Returns:
        A TelegramChannel whose ``messages`` are in reverse document order
        (presumably newest first -- confirm against the page layout),
        filtered by ``only_text`` and then truncated to ``limit``.

    Raises:
        requests.RequestException: if the HTTP request itself fails.
    """
    channel_url = TELEGRAM_CHANNEL_WEBVIEW_PREFIX + channel_name
    response = requests.get(
        url=channel_url,
        timeout=DEFAULT_REQUEST_TIMEOUT,
        headers=DEFAULT_REQUEST_HEADERS,
    )
    bs = BeautifulSoup(response.text, features="lxml")

    messages = [_parse_message(tag) for tag in bs.select(TELEGRAM_MESSAGE_CLASS)]
    if only_text:
        messages = [m for m in messages if m.text]

    # Reverse first, then cut, so the limit keeps the end of the page.
    messages.reverse()
    return TelegramChannel(
        url=channel_url,
        name=channel_name,
        messages=messages[:limit],
    )

View File

@@ -1,110 +0,0 @@
from abc import ABC, abstractmethod
from parsing.telegram.cleanup import cleanup_telegram_message_text
from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType
class Parser(ABC):
    """Base class for converters from a raw channel message to TelegramChannelMessage."""

    def parse(self, channel, message):
        """Build a message holding only the fields shared by every message kind."""
        message_link = _TELEGRAM_MESSAGE_LINK.format(channel.channel_id, message.id)
        return TelegramChannelMessage(
            channel=channel,
            link=message_link,
            telegram_id=message.id,
            grouped_id=message.grouped_id,
            timestamp=message.date,
        )

    @abstractmethod
    def matches(self, channel, message):
        """Tell whether this parser is able to handle ``message``."""

    @staticmethod
    def from_message(channel, message):
        """Return the first registered parser that matches, or None if none does."""
        candidates = (
            candidate
            for candidate in _messages_parsers
            if candidate.matches(channel, message)
        )
        return next(candidates, None)
class SimpleTextParser(Parser):
    """Parser for messages that carry non-empty text."""

    def matches(self, channel, message):
        """Accept messages whose ``text`` is present and has at least one character."""
        raw_text = message.text
        if raw_text is None:
            return False
        return len(raw_text) > 0

    @classmethod
    def parse_message_title(cls, text):
        """Return the first line of ``text``, or an empty string for falsy input."""
        if not text:
            return ""
        first_line, _, _ = text.partition("\n")
        return first_line

    def parse(self, channel, message):
        """Fill in raw text, cleaned text, title and the TEXT type."""
        result = super().parse(channel, message)
        result.text = message.text
        result.clean_text = cleanup_telegram_message_text(message.text)
        # The title is derived from the cleaned text, not the raw one.
        result.title = self.parse_message_title(result.clean_text)
        result.type = MessageType.TEXT
        return result
class VideoParser(Parser):
    """Parser for messages whose ``video`` attribute is set."""

    def matches(self, channel, message):
        """Accept any message that has a video."""
        return message.video is not None

    def parse(self, channel, message):
        """Return the common fields with a placeholder title and the VIDEO type."""
        video_message = super().parse(channel, message)
        video_message.title, video_message.type = "[video]", MessageType.VIDEO
        return video_message
class PhotoParser(Parser):
    """Parser for messages whose ``photo`` attribute is set."""

    def matches(self, channel, message):
        """Accept any message that has a photo."""
        return message.photo is not None

    def parse(self, channel, message):
        """Return the common fields with a placeholder title and the PHOTO type."""
        photo_message = super().parse(channel, message)
        photo_message.title, photo_message.type = "[photo]", MessageType.PHOTO
        return photo_message
class FileParser(Parser):
    """Parser for messages whose ``file`` attribute is set."""

    def matches(self, channel, message):
        """Accept any message that has a file."""
        return message.file is not None

    def parse(self, channel, message):
        """Return the common fields with a placeholder title and the FILE type."""
        file_message = super().parse(channel, message)
        file_message.title, file_message.type = "[file]", MessageType.FILE
        return file_message
class VoiceParser(Parser):
    """Parser for messages whose ``voice`` attribute is set."""

    def matches(self, channel, message):
        """Accept any message that has a voice recording."""
        return message.voice is not None

    def parse(self, channel, message):
        """Return the common fields with a placeholder title and the VOICE type."""
        voice_message = super().parse(channel, message)
        voice_message.title, voice_message.type = "[voice]", MessageType.VOICE
        return voice_message
def parse_channel(channel_id, chat_full):
    """Build a TelegramChannel from a full-channel response.

    Title and username are read from the first entry of ``chat_full.chats``;
    the description comes from ``chat_full.full_chat.about``.
    """
    title = chat_full.chats[0].title
    about = chat_full.full_chat.about
    channel_link = _TELEGRAM_CHANNEL_LINK.format(chat_full.chats[0].username)
    return TelegramChannel(
        channel_id=channel_id,
        title=title,
        description=about,
        link=channel_link,
    )
# Registered parsers in priority order: Parser.from_message walks this list
# and returns the first parser whose matches() is true, so a message with text
# is handled by SimpleTextParser even when it also carries media.
_messages_parsers = [
SimpleTextParser(),
VideoParser(),
PhotoParser(),
VoiceParser(),
FileParser(),
]
# Link templates filled in with str.format(): the channel template takes one
# value, the message template takes a channel value followed by a message id.
_TELEGRAM_CHANNEL_LINK = "https://t.me/{}"
_TELEGRAM_MESSAGE_LINK = _TELEGRAM_CHANNEL_LINK + "/{}"

View File

@@ -1,78 +0,0 @@
from telethon.sync import TelegramClient, functions
from django.conf import settings
from parsing.exceptions import ParsingException
from parsing.telegram.parsers import Parser, parse_channel
from parsing.telegram.models import MessageType
import asyncio
DEFAULT_LIMIT = 30
def get_channel(channel_id, *, types=None, limit=DEFAULT_LIMIT):
    """Load a channel and its messages through a Telegram client session.

    Args:
        channel_id: Channel identifier passed to the full-channel request.
        types: Optional collection of message types to keep (forwarded to
            get_channel_messages).
        limit: Maximum number of messages requested from the client.

    Returns:
        The parsed channel with its ``messages`` attribute populated.

    Raises:
        ParsingException: if resolving the channel raises ValueError.
    """
    # A dedicated event loop is created and installed for this call.
    # NOTE(review): the loop is never closed explicitly here.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    client_session = TelegramClient(
        settings.TELEGRAM_SESSION_FILE,
        settings.TELEGRAM_APP_ID,
        settings.TELEGRAM_APP_HASH,
        loop=event_loop,
    )
    with client_session as client:
        try:
            channel = parse_channel(
                channel_id,
                client(functions.channels.GetFullChannelRequest(channel=channel_id)),
            )
        except ValueError:
            raise ParsingException(f"No channel named '{channel_id}'")

        channel.messages = get_channel_messages(
            client, channel, types=types, limit=limit
        )
        return channel
def get_channel_messages(client, channel, *, types=None, limit=DEFAULT_LIMIT):
    """Fetch, parse and de-duplicate the messages of ``channel``.

    Messages sharing a ``grouped_id`` are merged so that a text message, when
    the group has one, stands in for the group's non-text messages.

    Args:
        client: Object providing ``iter_messages(channel_id, limit=...)``.
        channel: Parsed channel; its ``channel_id`` selects the messages and
            it is passed on to the parsers.
        types: Optional collection of message types; when truthy, messages of
            any other type are skipped.
        limit: Maximum number of raw messages requested from the client.

    Returns:
        List of parsed messages in the order the client yielded them.
    """

    def get_messages_indexes(messages, grouped_id, message_type=None, inverse=False):
        """Return indexes of messages in the group whose type equals
        ``message_type`` (or differs from it when ``inverse`` is true)."""
        return [
            index
            for index, message in enumerate(messages)
            if message.grouped_id == grouped_id
            # Compare the message's *type*; comparing the message object
            # itself to a MessageType can never be equal.
            and (message.type == message_type) != inverse
        ]

    def merge_messages(messages, new_message):
        """Merge ``new_message`` into ``messages`` according to its group."""
        if new_message.type == MessageType.TEXT:
            indexes = get_messages_indexes(
                messages, new_message.grouped_id, message_type=MessageType.TEXT, inverse=True
            )
            if not indexes:
                # No media of this group collected yet: keep the text rather
                # than silently dropping it.
                messages.append(new_message)
                return
            # The text takes the place of the last media message of the group...
            messages[indexes.pop()] = new_message
            # ...and the remaining ones are deleted by position, highest index
            # first so that earlier indexes stay valid.
            for index in reversed(indexes):
                del messages[index]
        else:
            indexes = get_messages_indexes(
                messages, new_message.grouped_id, message_type=MessageType.TEXT
            )
            # Media is only kept while the group has no text message.
            if not indexes:
                messages.append(new_message)

    channel_messages = []
    for t_message in client.iter_messages(channel.channel_id, limit=limit):
        parser = Parser.from_message(channel, t_message)
        if parser is None:
            continue
        message = parser.parse(channel, t_message)
        if message is None:
            continue
        if types and message.type not in types:
            continue
        if message.grouped_id is None:
            channel_messages.append(message)
        else:
            merge_messages(channel_messages, message)
    return channel_messages

View File

@@ -1,47 +1,38 @@
from django.contrib.syndication.views import Feed
from django.http import Http404, HttpResponseBadRequest
from parsing.exceptions import ParsingException
from parsing.telegram.telegram import get_channel
from parsing.telegram.models import MessageType
from parsing.telegram.parser import parse_channel
# Syndication feed (subclass of django.contrib.syndication.views.Feed) whose
# items are the messages of a Telegram channel.
class TelegramChannelFeed(Feed):
# Number of items used when the "size" query parameter is absent or empty.
FEED_ITEMS = 30
def get_object(self, request, channel):
def get_object(self, request, channel_name):
# "size" and "only" are read from the request's query string.
limit = int(request.GET.get("size") or self.FEED_ITEMS)
only = str(request.GET.get("only") or "")
if only:
try:
only = [MessageType(item.strip()) for item in only.split(",")]
except (KeyError, ValueError):
return HttpResponseBadRequest()
limit = 100 # dirty hack: artificially increase the limit to get more messages after filtering
try:
return get_channel(channel, types=only, limit=limit)
except ParsingException:
raise Http404()
return parse_channel(channel_name, only_text=only == "text", limit=limit)
def title(self, obj):
return obj.title
def link(self, obj):
return obj.link
def description(self, obj):
return obj.description
return obj.name
def items(self, obj):
return obj.messages
def link(self, obj):
return obj.url
def item_title(self, item):
return item.title
return item.text
def item_description(self, item):
return item.clean_text
# Item body: an <img> tag for the photo (if any) followed by the text (if any).
result = ""
if item.photo:
result += f"<img src=\"{item.photo}\"><br>"
if item.text:
result += str(item.text)
return result
def item_link(self, item):
return item.link
return item.url
def item_pubdate(self, item):
return item.created_at

View File

@@ -10,5 +10,4 @@ feedparser==5.2.1
sentry-sdk==0.14.1
nltk==3.4.5
newspaper3k>=0.2.8
telethon==1.18.2
django-bleach==0.6.1

View File

@@ -1,11 +0,0 @@
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")
import django
django.setup()
from telethon.sync import TelegramClient
from django.conf import settings
# Open a client with the configured session file and credentials; the message
# below is printed once the context has been entered.
with TelegramClient(
    settings.TELEGRAM_SESSION_FILE,
    settings.TELEGRAM_APP_ID,
    settings.TELEGRAM_APP_HASH,
) as client:
    print("Successfully setup Telegram session.")