From 91d875d7e98625895d52e9ca1a407487668ac21e Mon Sep 17 00:00:00 2001 From: vas3k Date: Mon, 6 Jan 2020 00:19:05 +0100 Subject: [PATCH] Make article summarizer --- boards.yml | 2 +- boards/migrations/0002_article_summary.py | 18 +++++++++ boards/models.py | 1 + boards/templatetags/text_filters.py | 11 ++++++ infomate/settings.py | 2 +- requirements.txt | 1 + scripts/update.py | 46 +++++++++++++++++------ static/css/components.css | 4 +- templates/board.html | 6 ++- templates/index.html | 5 ++- templates/layout.html | 2 +- 11 files changed, 79 insertions(+), 19 deletions(-) create mode 100644 boards/migrations/0002_article_summary.py diff --git a/boards.yml b/boards.yml index 917ed82..e75d320 100644 --- a/boards.yml +++ b/boards.yml @@ -62,7 +62,7 @@ boards: - name: MIT Technology Review rss: https://www.technologyreview.com/topnews.rss url: https://www.technologyreview.com - - name: Мейкерство + - name: Инди-разработка slug: make feeds: - name: Show HN diff --git a/boards/migrations/0002_article_summary.py b/boards/migrations/0002_article_summary.py new file mode 100644 index 0000000..348ad8b --- /dev/null +++ b/boards/migrations/0002_article_summary.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.8 on 2020-01-05 22:42 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('boards', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='article', + name='summary', + field=models.TextField(null=True), + ), + ] diff --git a/boards/models.py b/boards/models.py index 98483d3..21c825c 100644 --- a/boards/models.py +++ b/boards/models.py @@ -137,6 +137,7 @@ class Article(models.Model): title = models.CharField(max_length=256) image = models.URLField(max_length=512, null=True) description = models.TextField(null=True) + summary = models.TextField(null=True) created_at = models.DateTimeField(db_index=True) updated_at = models.DateTimeField() diff --git a/boards/templatetags/text_filters.py b/boards/templatetags/text_filters.py index 2b9b789..2f3b383 100755 --- a/boards/templatetags/text_filters.py +++ b/boards/templatetags/text_filters.py @@ -15,6 +15,17 @@ def pretty_url(value): return re.sub(r"https?://(www\.)?", "", value, 1) +@register.filter(is_safe=True) +def nl2br(text): + """ + Replaces \n to
+ """ + if not text: + return "" + text = text.replace("\n", "
") + return text + + @register.filter def cool_number(value, num_decimals=1): """ diff --git a/infomate/settings.py b/infomate/settings.py index ecd4a3e..49c44ee 100644 --- a/infomate/settings.py +++ b/infomate/settings.py @@ -88,7 +88,7 @@ CSS_HASH = str(random()) # App settings APP_NAME = "Infomate" -APP_TITLE = "Читай то, что читают другие" +APP_TITLE = "Читай, что читают другие" APP_DESCRIPTION = "" APP_HOST = "https://infomate.club" diff --git a/requirements.txt b/requirements.txt index a9a51ee..0982401 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ pyyaml==5.2 feedparser==5.2.1 sentry-sdk==0.13.5 pyjwt==1.7.1 +newspaper3k>=0.2.8 diff --git a/scripts/update.py b/scripts/update.py index d0c67ba..615b90e 100644 --- a/scripts/update.py +++ b/scripts/update.py @@ -18,6 +18,8 @@ import requests import click import feedparser from bs4 import BeautifulSoup +from requests import RequestException +from newspaper import Article as NewspaperArticle from boards.models import BoardFeed, Article, Board @@ -81,7 +83,10 @@ def worker(): if task is None: break - refresh_feed(task) + try: + refresh_feed(task) + except Exception: + pass # to avoid infinite wait in .join() queue.task_done() @@ -106,21 +111,27 @@ def refresh_feed(item): if is_created: # parse heavy info - try: - real_url = resolve_real_url(entry) + real_url = resolve_real_url(entry) + + if real_url: article.url = real_url[:2000] article.domain = parse_domain(real_url)[:256] - except ConnectionError: - log.warning(f"Failed to resolve real URL: {entry.link}") - summary, lead_image = parse_entry_text_and_image(entry) + text, lead_image = parse_entry_text_and_image(entry) - if summary: - article.description = summary[:1000] + if text: + article.description = text[:1000] if lead_image: article.image = lead_image[:512] + summary, summary_image = load_and_parse_full_article_text_and_image(article.url) + + article.summary = summary + + if summary_image: + article.image = summary_image[:512] + article.save() week_ago = datetime.utcnow() - timedelta(days=7) @@ -139,11 +150,18 @@ def resolve_real_url(entry): depth = 10 while depth > 0: depth -= 1 - r = requests.head(url) - if 300 < r.status_code < 400: - url = r.headers["location"] + + try: + response = requests.head(url) + except RequestException: + log.warning(f"Failed to resolve real URL: {entry.link}") + return None + + if 300 < response.status_code < 400: + url = response.headers["location"] else: break + return url @@ -175,7 +193,11 @@ def parse_entry_text_and_image(entry): def load_and_parse_full_article_text_and_image(url): - pass + article = NewspaperArticle(url) + article.download() + article.parse() + article.nlp() + return article.summary, article.top_image if __name__ == '__main__': diff --git a/static/css/components.css b/static/css/components.css index 2b1a32a..b99adb6 100644 --- a/static/css/components.css +++ b/static/css/components.css @@ -152,7 +152,7 @@ font-size: 180%; text-align: center; border-bottom: solid 2px var(--text-color); - min-height: 40px; + min-height: 45px; } .is-block-header-dummy { @@ -202,7 +202,7 @@ } .feed { - padding: 30px 20px 0; + padding: 35px 20px 0; } .feed-title { diff --git a/templates/board.html b/templates/board.html index c665548..199879b 100644 --- a/templates/board.html +++ b/templates/board.html @@ -62,7 +62,11 @@ {{ article.title|truncatechars:100 }} {% if article.description and article.description|length > 20 %} - {{ article.description|truncatechars:300 }} + {% if article.summary %} + {{ article.summary|striptags|nl2br|truncatechars:300|safe }} + {% else %} + {{ article.description|truncatechars:300 }} + {% endif %} {% endif %} {{ article.natural_created_at }} diff --git a/templates/index.html b/templates/index.html index aed1dfd..368a5ed 100644 --- a/templates/index.html +++ b/templates/index.html @@ -4,7 +4,10 @@ {% block content %}
-

Читай то, что читают другие. Формируй собственное инфополе.

+

+ Читай интернет так, как читают его другие.
+ Формируй собственное инфополе. +

diff --git a/templates/layout.html b/templates/layout.html index f3909cd..15424bf 100644 --- a/templates/layout.html +++ b/templates/layout.html @@ -32,7 +32,7 @@ {% block footer %}