Add echo msk filter
This commit is contained in:
@@ -37,6 +37,8 @@ boards:
|
||||
url: https://echo.msk.ru/
|
||||
rss: https://echo.msk.ru/news.rss
|
||||
icon: https://i.vas3k.ru/f9a8212a62b560c42aad54b722f838f3cc10abe30786a0b875230950b8c2dc8e.png
|
||||
filters:
|
||||
- echomsk_title_fix
|
||||
# - name: РБК
|
||||
# url: https://www.rbc.ru/
|
||||
# rss: http://static.feed.rbc.ru/rbc/internal/rss.rbc.ru/rbc.ru/news.rss
|
||||
@@ -67,7 +69,6 @@ boards:
|
||||
- https://kp.ru/rss/allsections.xml
|
||||
- https://iz.ru/xml/rss/all.xml
|
||||
- https://ria.ru/export/rss2/archive/index.xml
|
||||
- http://static.feed.rbc.ru/rbc/internal/rss.rbc.ru/rbc.ru/news.rss
|
||||
- name: Телеграм
|
||||
slug: tg
|
||||
feeds:
|
||||
|
||||
19
boards/migrations/0010_boardfeed_filters.py
Normal file
19
boards/migrations/0010_boardfeed_filters.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Generated by Django 2.2.13 on 2021-04-29 09:47
|
||||
|
||||
import django.contrib.postgres.fields.jsonb
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add a nullable JSON ``filters`` field to the ``boardfeed`` model."""

    # Must run after the previous migration of the ``boards`` app.
    dependencies = [("boards", "0009_auto_20200905_1246")]

    operations = [
        migrations.AddField(
            model_name="boardfeed",
            name="filters",
            field=django.contrib.postgres.fields.jsonb.JSONField(null=True),
        ),
    ]
|
||||
@@ -118,6 +118,7 @@ class BoardFeed(models.Model):
|
||||
index = models.PositiveIntegerField(default=0)
|
||||
|
||||
conditions = JSONField(null=True)
|
||||
filters = JSONField(null=True)
|
||||
is_parsable = models.BooleanField(default=True)
|
||||
|
||||
class Meta:
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
Django==2.2.13
|
||||
psycopg2==2.8.4
|
||||
psycopg2-binary==2.8.6
|
||||
click==7.0
|
||||
pillow==7.1.0
|
||||
pillow==8.2.0
|
||||
awesome-slugify>=1.6.5
|
||||
requests==2.22.0
|
||||
beautifulsoup4==4.6.2
|
||||
pyyaml==5.2
|
||||
feedparser==5.2.1
|
||||
feedparser==6
|
||||
sentry-sdk==0.14.1
|
||||
nltk==3.4.5
|
||||
newspaper3k>=0.2.8
|
||||
|
||||
10
scripts/filters.py
Normal file
10
scripts/filters.py
Normal file
@@ -0,0 +1,10 @@
|
||||
def echomsk_title_fix(entry):
    """Strip the fixed-width leading prefix from a feed entry title.

    When the title is longer than 20 characters and has a colon at index 17,
    the first 19 characters are dropped and the remainder becomes the new
    title. Any other title (including a missing one) is left untouched.

    NOTE(review): the 17/19 offsets presumably match a fixed-width prefix
    ending in ": " used by the Echo Moskvy feed -- confirm against real data.

    Args:
        entry: Mapping-like feed entry supporting ``get("title")``, item
            assignment and attribute assignment (presumably a feedparser
            entry -- verify against the caller).

    Returns:
        The same ``entry`` object, modified in place when the prefix matched.
    """
    title = entry.get("title")

    # A missing or non-string title cannot carry the prefix; return early
    # instead of raising on ``len(None)``.
    if not isinstance(title, str):
        return entry

    if len(title) > 20 and title[17] == ":":
        cleaned = title[19:]
        # The title is read through the mapping key, so store it there too:
        # on a dict subclass without a custom ``__setattr__`` an attribute
        # assignment alone leaves ``entry["title"]`` unchanged.
        entry["title"] = cleaned
        entry.title = cleaned

    return entry
|
||||
|
||||
|
||||
# Registry of entry filters: maps the filter code listed in a feed's
# ``filters`` setting to the callable that is applied to each entry.
FILTERS = dict(
    echomsk_title_fix=echomsk_title_fix,
)
|
||||
@@ -103,6 +103,7 @@ def initialize(config, board_slug, upload_favicons, always_yes):
|
||||
index=feed_index,
|
||||
columns=feed_config.get("columns") or 1,
|
||||
conditions=feed_config.get("conditions"),
|
||||
filters=feed_config.get("filters"),
|
||||
is_parsable=feed_config.get("is_parsable", True),
|
||||
view=feed_config.get("view") or BoardFeed.DEFAULT_VIEW,
|
||||
)
|
||||
|
||||
@@ -2,8 +2,6 @@ import io
|
||||
import os
|
||||
import sys
|
||||
import django
|
||||
from django.db.models import Q
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")
|
||||
django.setup()
|
||||
@@ -20,6 +18,7 @@ from requests import RequestException
|
||||
from newspaper import Article as NewspaperArticle, ArticleException
|
||||
|
||||
from boards.models import BoardFeed, Article, Board
|
||||
from scripts.filters import FILTERS
|
||||
from scripts.common import DEFAULT_REQUEST_HEADERS, DEFAULT_REQUEST_TIMEOUT, MAX_PARSABLE_CONTENT_LENGTH, resolve_url, \
|
||||
parse_domain, parse_datetime, parse_title, parse_link, parse_rss_image, parse_rss_text_and_image
|
||||
|
||||
@@ -57,6 +56,7 @@ def update(num_workers, force, feed):
|
||||
"rss": feed.rss,
|
||||
"mix": feed.mix,
|
||||
"conditions": feed.conditions,
|
||||
"filters": feed.filters,
|
||||
"is_parsable": feed.is_parsable,
|
||||
})
|
||||
|
||||
@@ -139,6 +139,7 @@ def fetch_rss(item, rss):
|
||||
|
||||
print(f"- article: '{entry_title}' {entry_link}")
|
||||
|
||||
# check conditions (skip articles if false)
|
||||
conditions = item.get("conditions")
|
||||
if conditions:
|
||||
is_valid = check_conditions(conditions, entry)
|
||||
@@ -146,11 +147,20 @@ def fetch_rss(item, rss):
|
||||
print(f"- condition {conditions} does not match. Skipped")
|
||||
continue
|
||||
|
||||
# apply filters (cleanup titles, etc)
|
||||
filters = item.get("filters")
|
||||
if filters:
|
||||
for filter_code in filters:
|
||||
if FILTERS.get(filter_code):
|
||||
entry = FILTERS[filter_code](entry)
|
||||
|
||||
created_at = parse_datetime(entry)
|
||||
if created_at <= datetime.utcnow() - DELETE_OLD_ARTICLES_DELTA:
|
||||
print(f"- article is too old. Skipped")
|
||||
continue
|
||||
|
||||
entry_title = parse_title(entry)
|
||||
entry_link = parse_link(entry)
|
||||
article, is_created = Article.objects.get_or_create(
|
||||
board_id=item["board_id"],
|
||||
feed_id=item["id"],
|
||||
|
||||
Reference in New Issue
Block a user