New telegram parser, removed telethon
This commit is contained in:
@@ -13,7 +13,8 @@ urlpatterns = [
|
||||
|
||||
path("<slug:board_slug>/", board, name="board"),
|
||||
|
||||
path("parsing/telegram/<str:channel>/",
|
||||
path("parsing/telegram/<str:channel_name>/",
|
||||
cache_page(settings.TELEGRAM_CACHE_SECONDS)(TelegramChannelFeed()),
|
||||
name="telegram_channel_feed")
|
||||
name="telegram_channel_feed"),
|
||||
|
||||
]
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
import re
|
||||
|
||||
|
||||
def cleanup_telegram_message_text(text):
    """Strip Telegram markup noise from a message and return plain text.

    Removes markdown-style attachments/links, hashtags, emphasis markers and
    HTML tags, then collapses excessive blank lines and trims the result.

    Args:
        text: The raw message text. Non-string values are converted with
            ``str()``; ``None`` is treated as an empty message.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    if text is None:
        # str(None) would leak the literal word "None" into the output.
        return ""

    text = str(text)
    removal_patterns = [
        r"\[.*?\]\(.*?\)",  # attached files and images
        r"#[\S]*",  # hashtags
        r"[\*\~\`]+",  # stars, tildes and backticks
        r"<[^>]*>",  # html tags
    ]
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)

    # Collapse runs of 3+ newlines into a single blank line. Deleting them
    # outright would glue the surrounding paragraphs into one word.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()
|
||||
@@ -1,49 +0,0 @@
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class TelegramChannel:
    """Plain container describing a Telegram channel and its messages."""

    def __init__(self, channel_id=None, title=None, link=None, description=None, messages=None):
        """Store the given fields as attributes.

        A falsy ``messages`` argument (``None`` or an empty sequence) is
        replaced by a fresh empty list, so instances never share a default.
        """
        self.channel_id, self.title = channel_id, title
        self.link, self.description = link, description
        self.messages = messages if messages else []
|
||||
|
||||
|
||||
class TelegramChannelMessage:
    """Plain container for one parsed message of a Telegram channel."""

    def __init__(
        self,
        telegram_id=None,
        title=None,
        text=None,
        clean_text=None,
        link=None,
        channel=None,
        grouped_id=None,
        type=None,
        timestamp=None,
    ):
        """Store every argument under the attribute of the same name."""
        # Identity and display fields.
        self.telegram_id, self.title = telegram_id, title
        # Raw text and its cleaned-up counterpart.
        self.text, self.clean_text = text, clean_text
        # Where the message lives.
        self.link, self.channel = link, channel
        # Grouping key, message kind and publication time.
        self.grouped_id, self.type, self.timestamp = grouped_id, type, timestamp
|
||||
|
||||
|
||||
class MessageType(Enum):
    """Kinds of channel message; each member's value is its lowercase name."""

    TEXT = "text"
    VIDEO = "video"
    PHOTO = "photo"
    VOICE = "voice"
    FILE = "file"
|
||||
70
parsing/telegram/parser.py
Normal file
70
parsing/telegram/parser.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import re
|
||||
from collections import namedtuple
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from scripts.common import DEFAULT_REQUEST_TIMEOUT, DEFAULT_REQUEST_HEADERS
|
||||
|
||||
# URL prefix that parse_channel() prepends to the channel name.
TELEGRAM_CHANNEL_WEBVIEW_PREFIX = "https://t.me/s/"

# Captures the https URL inside a CSS ``url('...')`` value. Written as a raw
# string so that ``\(`` and ``\'`` are regex escapes rather than invalid
# Python string escapes (which trigger a Deprecation/SyntaxWarning).
BACKGROUND_IMAGE_RE = re.compile(r"url\('(https://.+?)'\)")

# CSS selectors for the parts of a message on the channel page.
TELEGRAM_MESSAGE_CLASS = ".tgme_widget_message"
TELEGRAM_MESSAGE_TEXT_CLASS = ".tgme_widget_message_text"
TELEGRAM_MESSAGE_PHOTO_CLASS = ".tgme_widget_message_photo_wrap"
TELEGRAM_MESSAGE_DATE_CLASS = ".tgme_widget_message_date"

# Lightweight immutable records returned by parse_channel().
TelegramChannel = namedtuple("TelegramChannel", ["url", "name", "messages"])
TelegramMessage = namedtuple("TelegramMessage", ["url", "text", "photo", "created_at"])
|
||||
|
||||
|
||||
def parse_channel(channel_name, only_text=False, limit=100) -> TelegramChannel:
    """Download a channel's web page and extract its messages.

    Args:
        channel_name: Channel name appended to TELEGRAM_CHANNEL_WEBVIEW_PREFIX.
        only_text: When true, messages without a text block are dropped.
        limit: Maximum number of messages to return.

    Returns:
        A TelegramChannel whose ``messages`` are in reverse page order
        (presumably newest first -- confirm against the page layout),
        truncated to ``limit``. Each message's ``text`` is the HTML of its
        text block (or None), and ``photo`` is an image URL (or None).
    """
    # Local import: the module itself only imports ``datetime``.
    from datetime import timezone

    channel_url = TELEGRAM_CHANNEL_WEBVIEW_PREFIX + channel_name
    response = requests.get(
        url=channel_url,
        timeout=DEFAULT_REQUEST_TIMEOUT,
        headers=DEFAULT_REQUEST_HEADERS,
    )

    page = BeautifulSoup(response.text, features="lxml")

    messages = []
    for message_tag in page.select(TELEGRAM_MESSAGE_CLASS):
        text_tag = message_tag.select_one(TELEGRAM_MESSAGE_TEXT_CLASS)
        message_text = str(text_tag) if text_tag is not None else None
        if only_text and not message_text:
            continue

        message_photo = None
        photo_tag = message_tag.select_one(TELEGRAM_MESSAGE_PHOTO_CLASS)
        if photo_tag is not None:
            # The wrapper may lack a background image; don't crash on it.
            photo_match = BACKGROUND_IMAGE_RE.search(str(photo_tag))
            if photo_match:
                message_photo = photo_match.group(1)

        message_url = None
        # Timezone-aware fallback so it can be compared with parsed
        # timestamps that carry a UTC offset (naive/aware mixing raises).
        message_time = datetime.now(timezone.utc)
        date_tag = message_tag.select_one(TELEGRAM_MESSAGE_DATE_CLASS)
        if date_tag is not None:
            message_url = date_tag.get("href")
            time_tag = date_tag.select_one("time")
            raw_datetime = time_tag.get("datetime") if time_tag is not None else None
            if raw_datetime:
                try:
                    message_time = datetime.fromisoformat(raw_datetime)
                except ValueError:
                    # Unparseable timestamp: keep the "now" fallback rather
                    # than failing the whole feed for one message.
                    pass

        messages.append(
            TelegramMessage(
                url=message_url,
                text=message_text,
                photo=message_photo,
                created_at=message_time,
            )
        )

    return TelegramChannel(
        url=channel_url,
        name=channel_name,
        messages=list(reversed(messages))[:limit],
    )
|
||||
@@ -1,110 +0,0 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from parsing.telegram.cleanup import cleanup_telegram_message_text
|
||||
from parsing.telegram.models import TelegramChannelMessage, TelegramChannel, MessageType
|
||||
|
||||
|
||||
class Parser(ABC):
    """Base class turning a raw Telegram message into a TelegramChannelMessage."""

    def parse(self, channel, message):
        """Build a TelegramChannelMessage holding the fields common to all kinds."""
        message_link = _TELEGRAM_MESSAGE_LINK.format(channel.channel_id, message.id)
        return TelegramChannelMessage(
            channel=channel,
            link=message_link,
            telegram_id=message.id,
            grouped_id=message.grouped_id,
            timestamp=message.date,
        )

    @abstractmethod
    def matches(self, channel, message):
        """Tell whether this parser can handle ``message``."""

    @staticmethod
    def from_message(channel, message):
        """Return the first registered parser matching ``message``, or None."""
        candidates = (
            candidate
            for candidate in _messages_parsers
            if candidate.matches(channel, message)
        )
        return next(candidates, None)
|
||||
|
||||
|
||||
class SimpleTextParser(Parser):
    """Parser for messages that carry non-empty text."""

    def matches(self, channel, message):
        """Accept messages whose ``text`` is present and not empty."""
        body = message.text
        if body is None:
            return False
        return len(body) > 0

    @classmethod
    def parse_message_title(cls, text):
        """Return the first line of ``text``, or "" when there is no text."""
        if not text:
            return ""
        first_line, _, _ = text.partition("\n")
        return first_line

    def parse(self, channel, message):
        """Fill in raw text, cleaned text, title and the TEXT type."""
        result = super().parse(channel, message)
        result.text = message.text
        cleaned = cleanup_telegram_message_text(message.text)
        result.clean_text = cleaned
        result.title = self.parse_message_title(cleaned)
        result.type = MessageType.TEXT
        return result
|
||||
|
||||
|
||||
class VideoParser(Parser):
    """Parser for messages whose ``video`` attribute is set."""

    def matches(self, channel, message):
        """Accept messages that have a video."""
        return message.video is not None

    def parse(self, channel, message):
        """Label the common message fields as a video."""
        result = super().parse(channel, message)
        result.title, result.type = "[video]", MessageType.VIDEO
        return result
|
||||
|
||||
|
||||
class PhotoParser(Parser):
    """Parser for messages whose ``photo`` attribute is set."""

    def matches(self, channel, message):
        """Accept messages that have a photo."""
        return message.photo is not None

    def parse(self, channel, message):
        """Label the common message fields as a photo."""
        result = super().parse(channel, message)
        result.title, result.type = "[photo]", MessageType.PHOTO
        return result
|
||||
|
||||
|
||||
class FileParser(Parser):
    """Parser for messages whose ``file`` attribute is set."""

    def matches(self, channel, message):
        """Accept messages that have a file."""
        return message.file is not None

    def parse(self, channel, message):
        """Label the common message fields as a file."""
        result = super().parse(channel, message)
        result.title, result.type = "[file]", MessageType.FILE
        return result
|
||||
|
||||
|
||||
class VoiceParser(Parser):
    """Parser for messages whose ``voice`` attribute is set."""

    def matches(self, channel, message):
        """Accept messages that have a voice recording."""
        return message.voice is not None

    def parse(self, channel, message):
        """Label the common message fields as a voice message."""
        result = super().parse(channel, message)
        result.title, result.type = "[voice]", MessageType.VOICE
        return result
|
||||
|
||||
|
||||
def parse_channel(channel_id, chat_full):
    """Build a TelegramChannel from a full-channel response object.

    The title and username come from the first entry of ``chat_full.chats``;
    the description comes from ``chat_full.full_chat.about``.
    """
    primary_chat = chat_full.chats[0]
    channel_link = _TELEGRAM_CHANNEL_LINK.format(primary_chat.username)
    return TelegramChannel(
        channel_id=channel_id,
        title=primary_chat.title,
        description=chat_full.full_chat.about,
        link=channel_link,
    )
|
||||
|
||||
|
||||
# Parsers tried by Parser.from_message(), which returns the first one whose
# matches() is true -- so the order below is a priority order (text first).
_messages_parsers = [
    SimpleTextParser(),
    VideoParser(),
    PhotoParser(),
    VoiceParser(),
    FileParser(),
]

# Link templates: the channel link takes one placeholder value, the message
# link takes two (Parser.parse passes the channel id and the message id).
_TELEGRAM_CHANNEL_LINK = "https://t.me/{}"
_TELEGRAM_MESSAGE_LINK = _TELEGRAM_CHANNEL_LINK + "/{}"
|
||||
@@ -1,78 +0,0 @@
|
||||
from telethon.sync import TelegramClient, functions
|
||||
from django.conf import settings
|
||||
|
||||
from parsing.exceptions import ParsingException
|
||||
from parsing.telegram.parsers import Parser, parse_channel
|
||||
from parsing.telegram.models import MessageType
|
||||
import asyncio
|
||||
|
||||
DEFAULT_LIMIT = 30
|
||||
|
||||
|
||||
def get_channel(channel_id, *, types=None, limit=DEFAULT_LIMIT):
    """Fetch a channel and its recent messages through a Telegram client.

    Args:
        channel_id: Channel identifier passed to the Telegram API.
        types: Optional collection of MessageType values to keep.
        limit: Maximum number of messages requested from Telegram.

    Returns:
        The parsed channel with its ``messages`` attribute populated.

    Raises:
        ParsingException: If the channel lookup raises ValueError.
    """
    # A dedicated event loop is created for each call.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        with TelegramClient(
            settings.TELEGRAM_SESSION_FILE,
            settings.TELEGRAM_APP_ID,
            settings.TELEGRAM_APP_HASH,
            loop=loop,
        ) as client:
            try:
                channel = parse_channel(
                    channel_id,
                    client(functions.channels.GetFullChannelRequest(channel=channel_id)),
                )
            except ValueError as err:
                # Keep the original error as the cause for easier debugging.
                raise ParsingException(f"No channel named '{channel_id}'") from err

            channel.messages = get_channel_messages(client, channel, types=types, limit=limit)

            return channel
    finally:
        # Release the loop's resources instead of leaking one loop per call,
        # and don't leave a closed loop registered as the current one.
        asyncio.set_event_loop(None)
        loop.close()
|
||||
|
||||
|
||||
def get_channel_messages(client, channel, *, types=None, limit=DEFAULT_LIMIT):
    """Iterate a channel's messages, parse them and merge grouped ones.

    Messages sharing a ``grouped_id`` are collapsed: once a TEXT message of
    the group is known, it replaces the group's non-text messages and later
    non-text messages of that group are skipped.

    Args:
        client: Telegram client providing ``iter_messages``.
        channel: Parsed channel; its ``channel_id`` selects the messages.
        types: Optional collection of MessageType values to keep.
        limit: Maximum number of messages requested from the client.

    Returns:
        A list of parsed messages.
    """

    def group_indexes(messages, grouped_id, message_type, inverse=False):
        """Return indexes of group members whose type is (or, inverted, is not) ``message_type``."""
        indexes = []
        for i, message in enumerate(messages):
            if message.grouped_id != grouped_id:
                continue
            # Compare the message's *type*, not the message object itself.
            if (message.type == message_type) != inverse:
                indexes.append(i)
        return indexes

    def merge_messages(messages, new_message):
        """Insert ``new_message`` so that each group ends up as one entry when it has text."""
        if new_message.type == MessageType.TEXT:
            media_indexes = group_indexes(
                messages, new_message.grouped_id, MessageType.TEXT, inverse=True
            )
            if not media_indexes:
                # No media of this group collected yet: keep the text itself
                # instead of silently dropping it.
                messages.append(new_message)
                return

            # The text takes the slot of the group's last media message...
            messages[media_indexes.pop()] = new_message
            # ...and the remaining media are deleted by index, highest first
            # so earlier deletions don't shift the indexes still to delete.
            for i in reversed(media_indexes):
                del messages[i]
        else:
            text_indexes = group_indexes(
                messages, new_message.grouped_id, MessageType.TEXT
            )
            if not text_indexes:
                messages.append(new_message)

    channel_messages = []

    for t_message in client.iter_messages(channel.channel_id, limit=limit):
        parser = Parser.from_message(channel, t_message)
        if parser is None:
            continue

        message = parser.parse(channel, t_message)
        if message is None:
            continue

        if types and message.type not in types:
            continue

        if message.grouped_id is not None:
            merge_messages(channel_messages, message)
        else:
            channel_messages.append(message)

    return channel_messages
|
||||
@@ -1,47 +1,38 @@
|
||||
from django.contrib.syndication.views import Feed
|
||||
from django.http import Http404, HttpResponseBadRequest
|
||||
|
||||
from parsing.exceptions import ParsingException
|
||||
from parsing.telegram.telegram import get_channel
|
||||
from parsing.telegram.models import MessageType
|
||||
from parsing.telegram.parser import parse_channel
|
||||
|
||||
|
||||
class TelegramChannelFeed(Feed):
|
||||
FEED_ITEMS = 30
|
||||
|
||||
def get_object(self, request, channel):
|
||||
def get_object(self, request, channel_name):
|
||||
limit = int(request.GET.get("size") or self.FEED_ITEMS)
|
||||
only = str(request.GET.get("only") or "")
|
||||
if only:
|
||||
try:
|
||||
only = [MessageType(item.strip()) for item in only.split(",")]
|
||||
except (KeyError, ValueError):
|
||||
return HttpResponseBadRequest()
|
||||
|
||||
limit = 100 # dirty hack: artificially increase the limit to get more messages after filtering
|
||||
|
||||
try:
|
||||
return get_channel(channel, types=only, limit=limit)
|
||||
except ParsingException:
|
||||
raise Http404()
|
||||
return parse_channel(channel_name, only_text=only == "text", limit=limit)
|
||||
|
||||
def title(self, obj):
|
||||
return obj.title
|
||||
|
||||
def link(self, obj):
|
||||
return obj.link
|
||||
|
||||
def description(self, obj):
|
||||
return obj.description
|
||||
return obj.name
|
||||
|
||||
def items(self, obj):
|
||||
return obj.messages
|
||||
|
||||
def link(self, obj):
|
||||
return obj.url
|
||||
|
||||
def item_title(self, item):
|
||||
return item.title
|
||||
return item.text
|
||||
|
||||
def item_description(self, item):
|
||||
return item.clean_text
|
||||
result = ""
|
||||
if item.photo:
|
||||
result += f"<img src=\"{item.photo}\"><br>"
|
||||
if item.text:
|
||||
result += str(item.text)
|
||||
return result
|
||||
|
||||
def item_link(self, item):
|
||||
return item.link
|
||||
return item.url
|
||||
|
||||
def item_pubdate(self, item):
|
||||
return item.created_at
|
||||
|
||||
@@ -10,5 +10,4 @@ feedparser==5.2.1
|
||||
sentry-sdk==0.14.1
|
||||
nltk==3.4.5
|
||||
newspaper3k>=0.2.8
|
||||
telethon==1.18.2
|
||||
django-bleach==0.6.1
|
||||
@@ -1,11 +0,0 @@
|
||||
# Stand-alone script: opens a TelegramClient with the session file and app
# credentials from Django settings and prints a confirmation once the client
# context has been entered.
# NOTE(review): presumably run once by hand to create/authorize the session
# file -- confirm.
import os

# The settings module must be configured before django.setup() runs.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")

import django

django.setup()

from telethon.sync import TelegramClient
from django.conf import settings


with TelegramClient(settings.TELEGRAM_SESSION_FILE, settings.TELEGRAM_APP_ID, settings.TELEGRAM_APP_HASH) as client:
    print("Successfully setup Telegram session.")
|
||||
Reference in New Issue
Block a user