Add echo msk filter

This commit is contained in:
vas3k
2021-04-29 13:40:55 +03:00
parent 01686236fc
commit ef150e81c0
7 changed files with 48 additions and 6 deletions

View File

@@ -37,6 +37,8 @@ boards:
url: https://echo.msk.ru/
rss: https://echo.msk.ru/news.rss
icon: https://i.vas3k.ru/f9a8212a62b560c42aad54b722f838f3cc10abe30786a0b875230950b8c2dc8e.png
filters:
- echomsk_title_fix
# - name: РБК
# url: https://www.rbc.ru/
# rss: http://static.feed.rbc.ru/rbc/internal/rss.rbc.ru/rbc.ru/news.rss
@@ -67,7 +69,6 @@ boards:
- https://kp.ru/rss/allsections.xml
- https://iz.ru/xml/rss/all.xml
- https://ria.ru/export/rss2/archive/index.xml
- http://static.feed.rbc.ru/rbc/internal/rss.rbc.ru/rbc.ru/news.rss
- name: Телеграм
slug: tg
feeds:

View File

@@ -0,0 +1,19 @@
# Generated by Django 2.2.13 on 2021-04-29 09:47
import django.contrib.postgres.fields.jsonb
from django.db import migrations
class Migration(migrations.Migration):
    """Add a nullable ``filters`` JSON field to the ``boardfeed`` model."""

    # Must be applied after the previous migration of the ``boards`` app.
    dependencies = [("boards", "0009_auto_20200905_1246")]

    operations = [
        migrations.AddField(
            model_name="boardfeed",
            name="filters",
            # null=True, so rows that existed before this migration need no value.
            field=django.contrib.postgres.fields.jsonb.JSONField(null=True),
        ),
    ]

View File

@@ -118,6 +118,7 @@ class BoardFeed(models.Model):
index = models.PositiveIntegerField(default=0)
conditions = JSONField(null=True)
filters = JSONField(null=True)
is_parsable = models.BooleanField(default=True)
class Meta:

View File

@@ -1,12 +1,12 @@
Django==2.2.13
psycopg2==2.8.4
psycopg2-binary==2.8.6
click==7.0
pillow==7.1.0
pillow==8.2.0
awesome-slugify>=1.6.5
requests==2.22.0
beautifulsoup4==4.6.2
pyyaml==5.2
feedparser==5.2.1
feedparser==6
sentry-sdk==0.14.1
nltk==3.4.5
newspaper3k>=0.2.8

10
scripts/filters.py Normal file
View File

@@ -0,0 +1,10 @@
def echomsk_title_fix(entry):
    """Strip the fixed-width leading prefix from a feed entry's title.

    A title is rewritten only when it is longer than 20 characters and has
    a colon at index 17. In that case the first 19 characters are dropped
    (the prefix, the colon and the single character after it). Any other
    title is left untouched.

    Args:
        entry: A dict-like feed entry supporting ``get`` and item
            assignment (e.g. a feedparser entry).

    Returns:
        The same ``entry`` object, with ``title`` rewritten when the
        prefix was found.
    """
    # NOTE(review): the prefix is presumably a date/time stamp emitted by
    # the echo.msk.ru feed -- confirm against the live RSS.
    colon_index = 17
    prefix_length = 19

    title = entry.get("title")
    # Entries without a usable title pass through unchanged instead of
    # crashing on len(None).
    if not isinstance(title, str):
        return entry

    if len(title) > 20 and title[colon_index] == ":":
        # Write through the mapping interface: attribute assignment on a
        # dict subclass only creates an instance attribute and leaves
        # entry["title"] / entry.get("title") holding the old value.
        entry["title"] = title[prefix_length:]

    return entry
# Registry of entry filters, keyed by the code listed under a feed's
# ``filters`` setting. Each filter is called with a feed entry and returns
# the entry to use from then on.
FILTERS = dict(
    echomsk_title_fix=echomsk_title_fix,
)

View File

@@ -103,6 +103,7 @@ def initialize(config, board_slug, upload_favicons, always_yes):
index=feed_index,
columns=feed_config.get("columns") or 1,
conditions=feed_config.get("conditions"),
filters=feed_config.get("filters"),
is_parsable=feed_config.get("is_parsable", True),
view=feed_config.get("view") or BoardFeed.DEFAULT_VIEW,
)

View File

@@ -2,8 +2,6 @@ import io
import os
import sys
import django
from django.db.models import Q
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")
django.setup()
@@ -20,6 +18,7 @@ from requests import RequestException
from newspaper import Article as NewspaperArticle, ArticleException
from boards.models import BoardFeed, Article, Board
from scripts.filters import FILTERS
from scripts.common import DEFAULT_REQUEST_HEADERS, DEFAULT_REQUEST_TIMEOUT, MAX_PARSABLE_CONTENT_LENGTH, resolve_url, \
parse_domain, parse_datetime, parse_title, parse_link, parse_rss_image, parse_rss_text_and_image
@@ -57,6 +56,7 @@ def update(num_workers, force, feed):
"rss": feed.rss,
"mix": feed.mix,
"conditions": feed.conditions,
"filters": feed.filters,
"is_parsable": feed.is_parsable,
})
@@ -139,6 +139,7 @@ def fetch_rss(item, rss):
print(f"- article: '{entry_title}' {entry_link}")
# check conditions (skip articles if false)
conditions = item.get("conditions")
if conditions:
is_valid = check_conditions(conditions, entry)
@@ -146,11 +147,20 @@ def fetch_rss(item, rss):
print(f"- condition {conditions} does not match. Skipped")
continue
# apply filters (cleanup titles, etc)
filters = item.get("filters")
if filters:
for filter_code in filters:
if FILTERS.get(filter_code):
entry = FILTERS[filter_code](entry)
created_at = parse_datetime(entry)
if created_at <= datetime.utcnow() - DELETE_OLD_ARTICLES_DELTA:
print(f"- article is too old. Skipped")
continue
entry_title = parse_title(entry)
entry_link = parse_link(entry)
article, is_created = Article.objects.get_or_create(
board_id=item["board_id"],
feed_id=item["id"],