Make article summarizer
This commit is contained in:
@@ -62,7 +62,7 @@ boards:
|
||||
- name: MIT Technology Review
|
||||
rss: https://www.technologyreview.com/topnews.rss
|
||||
url: https://www.technologyreview.com
|
||||
- name: Мейкерство
|
||||
- name: Инди-разработка
|
||||
slug: make
|
||||
feeds:
|
||||
- name: Show HN
|
||||
|
||||
18
boards/migrations/0002_article_summary.py
Normal file
18
boards/migrations/0002_article_summary.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 2.2.8 on 2020-01-05 22:42
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add a nullable ``summary`` text field to the ``article`` model."""

    # Applied on top of the initial migration of the ``boards`` app.
    dependencies = [
        ('boards', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='article',
            name='summary',
            field=models.TextField(null=True),
        ),
    ]
|
||||
@@ -137,6 +137,7 @@ class Article(models.Model):
|
||||
title = models.CharField(max_length=256)
|
||||
image = models.URLField(max_length=512, null=True)
|
||||
description = models.TextField(null=True)
|
||||
summary = models.TextField(null=True)
|
||||
|
||||
created_at = models.DateTimeField(db_index=True)
|
||||
updated_at = models.DateTimeField()
|
||||
|
||||
@@ -15,6 +15,17 @@ def pretty_url(value):
|
||||
return re.sub(r"https?://(www\.)?", "", value, 1)
|
||||
|
||||
|
||||
@register.filter(is_safe=True)
def nl2br(text):
    r"""
    Replace every newline character (``\n``) in ``text`` with a ``<br/>`` tag.

    Falsy input (``None``, empty string) produces an empty string.
    """
    # NOTE(review): the input is not HTML-escaped here; callers that mark the
    # result as safe are responsible for sanitizing it first.
    return text.replace("\n", "<br/>") if text else ""
|
||||
|
||||
|
||||
@register.filter
|
||||
def cool_number(value, num_decimals=1):
|
||||
"""
|
||||
|
||||
@@ -88,7 +88,7 @@ CSS_HASH = str(random())
|
||||
# App settings
|
||||
|
||||
APP_NAME = "Infomate"
|
||||
APP_TITLE = "Читай то, что читают другие"
|
||||
APP_TITLE = "Читай, что читают другие"
|
||||
APP_DESCRIPTION = ""
|
||||
APP_HOST = "https://infomate.club"
|
||||
|
||||
|
||||
@@ -7,3 +7,4 @@ pyyaml==5.2
|
||||
feedparser==5.2.1
|
||||
sentry-sdk==0.13.5
|
||||
pyjwt==1.7.1
|
||||
newspaper3k>=0.2.8
|
||||
|
||||
@@ -18,6 +18,8 @@ import requests
|
||||
import click
|
||||
import feedparser
|
||||
from bs4 import BeautifulSoup
|
||||
from requests import RequestException
|
||||
from newspaper import Article as NewspaperArticle
|
||||
|
||||
from boards.models import BoardFeed, Article, Board
|
||||
|
||||
@@ -81,7 +83,10 @@ def worker():
|
||||
if task is None:
|
||||
break
|
||||
|
||||
refresh_feed(task)
|
||||
try:
|
||||
refresh_feed(task)
|
||||
except Exception:
|
||||
pass # to avoid infinite wait in .join()
|
||||
|
||||
queue.task_done()
|
||||
|
||||
@@ -106,21 +111,27 @@ def refresh_feed(item):
|
||||
|
||||
if is_created:
|
||||
# parse heavy info
|
||||
try:
|
||||
real_url = resolve_real_url(entry)
|
||||
real_url = resolve_real_url(entry)
|
||||
|
||||
if real_url:
|
||||
article.url = real_url[:2000]
|
||||
article.domain = parse_domain(real_url)[:256]
|
||||
except ConnectionError:
|
||||
log.warning(f"Failed to resolve real URL: {entry.link}")
|
||||
|
||||
summary, lead_image = parse_entry_text_and_image(entry)
|
||||
text, lead_image = parse_entry_text_and_image(entry)
|
||||
|
||||
if summary:
|
||||
article.description = summary[:1000]
|
||||
if text:
|
||||
article.description = text[:1000]
|
||||
|
||||
if lead_image:
|
||||
article.image = lead_image[:512]
|
||||
|
||||
summary, summary_image = load_and_parse_full_article_text_and_image(article.url)
|
||||
|
||||
article.summary = summary
|
||||
|
||||
if summary_image:
|
||||
article.image = summary_image[:512]
|
||||
|
||||
article.save()
|
||||
|
||||
week_ago = datetime.utcnow() - timedelta(days=7)
|
||||
@@ -139,11 +150,18 @@ def resolve_real_url(entry):
|
||||
depth = 10
|
||||
while depth > 0:
|
||||
depth -= 1
|
||||
r = requests.head(url)
|
||||
if 300 < r.status_code < 400:
|
||||
url = r.headers["location"]
|
||||
|
||||
try:
|
||||
response = requests.head(url)
|
||||
except RequestException:
|
||||
log.warning(f"Failed to resolve real URL: {entry.link}")
|
||||
return None
|
||||
|
||||
if 300 < response.status_code < 400:
|
||||
url = response.headers["location"]
|
||||
else:
|
||||
break
|
||||
|
||||
return url
|
||||
|
||||
|
||||
@@ -175,7 +193,11 @@ def parse_entry_text_and_image(entry):
|
||||
|
||||
|
||||
def load_and_parse_full_article_text_and_image(url):
    """
    Load the page at ``url`` and return a ``(summary, top_image)`` tuple.

    The page is processed with ``NewspaperArticle`` in three steps, in order:
    ``download()``, ``parse()`` and ``nlp()``. The tuple is built from the
    resulting ``summary`` and ``top_image`` attributes. Exceptions raised by
    any of the steps are not handled here and propagate to the caller.
    """
    page = NewspaperArticle(url)
    page.download()
    page.parse()
    page.nlp()  # must run after parse(); populates the summary read below
    return page.summary, page.top_image
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -152,7 +152,7 @@
|
||||
font-size: 180%;
|
||||
text-align: center;
|
||||
border-bottom: solid 2px var(--text-color);
|
||||
min-height: 40px;
|
||||
min-height: 45px;
|
||||
}
|
||||
|
||||
.is-block-header-dummy {
|
||||
@@ -202,7 +202,7 @@
|
||||
}
|
||||
|
||||
.feed {
|
||||
padding: 30px 20px 0;
|
||||
padding: 35px 20px 0;
|
||||
}
|
||||
|
||||
.feed-title {
|
||||
|
||||
@@ -62,7 +62,11 @@
|
||||
<span class="article-tooltip-title">{{ article.title|truncatechars:100 }}</span>
|
||||
{% if article.description and article.description|length > 20 %}
|
||||
<span class="article-tooltip-description">
|
||||
{{ article.description|truncatechars:300 }}
|
||||
{% if article.summary %}
|
||||
{{ article.summary|striptags|nl2br|truncatechars:300|safe }}
|
||||
{% else %}
|
||||
{{ article.description|truncatechars:300 }}
|
||||
{% endif %}
|
||||
</span>
|
||||
{% endif %}
|
||||
<span class="article-tooltip-info">{{ article.natural_created_at }}</span>
|
||||
|
||||
@@ -4,7 +4,10 @@
|
||||
|
||||
{% block content %}
|
||||
<div class="landing-top">
|
||||
<h2 class="landing-top-title">Читай то, что читают другие. Формируй собственное инфополе.</h2>
|
||||
<h2 class="landing-top-title">
|
||||
Читай интернет так, как читают его другие.<br>
|
||||
Формируй собственное инфополе.
|
||||
</h2>
|
||||
</div>
|
||||
|
||||
<div class="landing-boards">
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
{% block footer %}
|
||||
<div class="footer">
|
||||
Сделал <a href="https://vas3k.ru">Вастрик</a>.<br><br>
|
||||
Пэт-проджект <a href="https://vas3k.ru">Вастрика</a>.<br><br>
|
||||
Сайт использует <a href="https://ru.wikipedia.org/wiki/Cookie" target="_blank">куки</a> для авторизации<br> и <a href="{% url "privacy_policy" %}">не собирает данные</a> больше, чем нужно.
|
||||
{% if me %}
|
||||
<br><a href="{% url "logout" %}" class="button logout-button">Выйти</a>
|
||||
|
||||
Reference in New Issue
Block a user