Make article summarizer

This commit is contained in:
vas3k
2020-01-06 00:19:05 +01:00
parent 69657de858
commit 91d875d7e9
11 changed files with 79 additions and 19 deletions

View File

@@ -62,7 +62,7 @@ boards:
- name: MIT Technology Review
rss: https://www.technologyreview.com/topnews.rss
url: https://www.technologyreview.com
- name: Мейкерство
- name: Инди-разработка
slug: make
feeds:
- name: Show HN

View File

@@ -0,0 +1,18 @@
# Generated by Django 2.2.8 on 2020-01-05 22:42
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema migration that adds a nullable ``summary`` text field to ``article``."""

    # Must run after the initial migration of the ``boards`` app.
    dependencies = [("boards", "0001_initial")]

    operations = [
        migrations.AddField(
            model_name="article",
            name="summary",
            # null=True lets rows created before this migration stay without a summary.
            field=models.TextField(null=True),
        ),
    ]

View File

@@ -137,6 +137,7 @@ class Article(models.Model):
title = models.CharField(max_length=256)
image = models.URLField(max_length=512, null=True)
description = models.TextField(null=True)
summary = models.TextField(null=True)
created_at = models.DateTimeField(db_index=True)
updated_at = models.DateTimeField()

View File

@@ -15,6 +15,17 @@ def pretty_url(value):
return re.sub(r"https?://(www\.)?", "", value, 1)
@register.filter(is_safe=True)
def nl2br(text):
    """
    Turn every newline character in ``text`` into a ``<br/>`` tag.

    Falsy input (``None`` or an empty string) produces an empty string.
    """
    return text.replace("\n", "<br/>") if text else ""
@register.filter
def cool_number(value, num_decimals=1):
"""

View File

@@ -88,7 +88,7 @@ CSS_HASH = str(random())
# App settings
APP_NAME = "Infomate"
APP_TITLE = "Читай то, что читают другие"
APP_TITLE = "Читай, что читают другие"
APP_DESCRIPTION = ""
APP_HOST = "https://infomate.club"

View File

@@ -7,3 +7,4 @@ pyyaml==5.2
feedparser==5.2.1
sentry-sdk==0.13.5
pyjwt==1.7.1
newspaper3k>=0.2.8

View File

@@ -18,6 +18,8 @@ import requests
import click
import feedparser
from bs4 import BeautifulSoup
from requests import RequestException
from newspaper import Article as NewspaperArticle
from boards.models import BoardFeed, Article, Board
@@ -81,7 +83,10 @@ def worker():
if task is None:
break
refresh_feed(task)
try:
refresh_feed(task)
except Exception:
pass # to avoid infinite wait in .join()
queue.task_done()
@@ -106,21 +111,27 @@ def refresh_feed(item):
if is_created:
# parse heavy info
try:
real_url = resolve_real_url(entry)
real_url = resolve_real_url(entry)
if real_url:
article.url = real_url[:2000]
article.domain = parse_domain(real_url)[:256]
except ConnectionError:
log.warning(f"Failed to resolve real URL: {entry.link}")
summary, lead_image = parse_entry_text_and_image(entry)
text, lead_image = parse_entry_text_and_image(entry)
if summary:
article.description = summary[:1000]
if text:
article.description = text[:1000]
if lead_image:
article.image = lead_image[:512]
summary, summary_image = load_and_parse_full_article_text_and_image(article.url)
article.summary = summary
if summary_image:
article.image = summary_image[:512]
article.save()
week_ago = datetime.utcnow() - timedelta(days=7)
@@ -139,11 +150,18 @@ def resolve_real_url(entry):
depth = 10
while depth > 0:
depth -= 1
r = requests.head(url)
if 300 < r.status_code < 400:
url = r.headers["location"]
try:
response = requests.head(url)
except RequestException:
log.warning(f"Failed to resolve real URL: {entry.link}")
return None
if 300 < response.status_code < 400:
url = response.headers["location"]
else:
break
return url
@@ -175,7 +193,11 @@ def parse_entry_text_and_image(entry):
def load_and_parse_full_article_text_and_image(url):
    """
    Fetch the page at ``url`` with newspaper and extract its summary and lead image.

    Runs the download -> parse -> nlp pipeline of ``NewspaperArticle``.
    The result is optional enrichment, so failures are logged and reported
    as "nothing found" instead of being raised.

    Args:
        url: address of the article page; may be empty or ``None``.

    Returns:
        A ``(summary, top_image)`` tuple taken from the processed article,
        or ``(None, None)`` when ``url`` is empty or the article could not
        be downloaded or processed.
    """
    if not url:
        # Nothing to download; skip the network round-trip entirely.
        return None, None

    article = NewspaperArticle(url)
    try:
        article.download()
        article.parse()
        article.nlp()
    except Exception as ex:
        # Best-effort boundary: refresh_feed calls this right before
        # article.save(), so an exception escaping from here would drop the
        # already-parsed article and abort the rest of the feed.
        log.warning(f"Failed to load full article text: {url} ({ex})")
        return None, None

    return article.summary, article.top_image
if __name__ == '__main__':

View File

@@ -152,7 +152,7 @@
font-size: 180%;
text-align: center;
border-bottom: solid 2px var(--text-color);
min-height: 40px;
min-height: 45px;
}
.is-block-header-dummy {
@@ -202,7 +202,7 @@
}
.feed {
padding: 30px 20px 0;
padding: 35px 20px 0;
}
.feed-title {

View File

@@ -62,7 +62,11 @@
<span class="article-tooltip-title">{{ article.title|truncatechars:100 }}</span>
{% if article.description and article.description|length > 20 %}
<span class="article-tooltip-description">
{{ article.description|truncatechars:300 }}
{% if article.summary %}
{{ article.summary|striptags|nl2br|truncatechars:300|safe }}
{% else %}
{{ article.description|truncatechars:300 }}
{% endif %}
</span>
{% endif %}
<span class="article-tooltip-info">{{ article.natural_created_at }}</span>

View File

@@ -4,7 +4,10 @@
{% block content %}
<div class="landing-top">
<h2 class="landing-top-title">Читай то, что читают другие. Формируй собственное инфополе.</h2>
<h2 class="landing-top-title">
Читай интернет так, как читают его другие.<br>
Формируй собственное инфополе.
</h2>
</div>
<div class="landing-boards">

View File

@@ -32,7 +32,7 @@
{% block footer %}
<div class="footer">
Сделал <a href="https://vas3k.ru">Вастрик</a>.<br><br>
Пэт-проджект <a href="https://vas3k.ru">Вастрика</a>.<br><br>
Сайт использует <a href="https://ru.wikipedia.org/wiki/Cookie" target="_blank">куки</a> для авторизации<br> и <a href="{% url "privacy_policy" %}">не собирает данные</a> больше, чем нужно.
{% if me %}
<br><a href="{% url "logout" %}" class="button logout-button">Выйти</a>