# Generated by Django 2.2.13 on 2021-04-29 09:47

import django.contrib.postgres.fields.jsonb
from django.db import migrations


class Migration(migrations.Migration):
    """Add the ``filters`` column to the ``boardfeed`` model.

    The new field is a PostgreSQL JSON field declared with ``null=True``,
    so rows that already exist need no default value.
    """

    # Must be applied after the previous migration of the ``boards`` app.
    dependencies = [
        ('boards', '0009_auto_20200905_1246'),
    ]

    operations = [
        # Single schema change: a nullable JSON field named ``filters``.
        migrations.AddField(
            model_name='boardfeed',
            name='filters',
            field=django.contrib.postgres.fields.jsonb.JSONField(null=True),
        ),
    ]
def echomsk_title_fix(entry):
    """Strip the fixed-width leading prefix from a feed entry's title.

    A title is rewritten only when it is longer than 20 characters and has
    a colon at index 17; in that case everything up to and including the
    character after the colon (index 18, presumably a space - confirm
    against the live feed) is dropped.

    Args:
        entry: A mutable mapping describing one feed item. It is modified
            in place.

    Returns:
        The same ``entry`` object, so filters can be chained.
    """
    title = entry.get("title")

    # An item may have no title at all (missing key or None); calling
    # len() on it would raise TypeError, so leave such entries untouched.
    if not isinstance(title, str):
        return entry

    if len(title) > 20 and title[17] == ":":
        # Write through the same mapping interface the title was read from.
        # Attribute assignment fails on plain dicts, and on dict subclasses
        # it only creates a shadow attribute while entry.get("title") and
        # entry["title"] keep returning the old value.
        entry["title"] = title[19:]
    return entry


# Registry of available entry filters, keyed by the filter code.
FILTERS = {
    "echomsk_title_fix": echomsk_title_fix,
}
conditions (skip articles if false) conditions = item.get("conditions") if conditions: is_valid = check_conditions(conditions, entry) @@ -146,11 +147,20 @@ def fetch_rss(item, rss): print(f"- condition {conditions} does not match. Skipped") continue + # apply filters (cleanup titles, etc) + filters = item.get("filters") + if filters: + for filter_code in filters: + if FILTERS.get(filter_code): + entry = FILTERS[filter_code](entry) + created_at = parse_datetime(entry) if created_at <= datetime.utcnow() - DELETE_OLD_ARTICLES_DELTA: print(f"- article is too old. Skipped") continue + entry_title = parse_title(entry) + entry_link = parse_link(entry) article, is_created = Article.objects.get_or_create( board_id=item["board_id"], feed_id=item["id"],