Streaming article loader + support empty Content-Lenght

This commit is contained in:
vas3k
2020-02-05 13:22:10 +01:00
parent 2cca90e62d
commit e3980c64f5
4 changed files with 39 additions and 23 deletions

View File

@@ -1,7 +1,15 @@
import socket
from urllib3.exceptions import InsecureRequestWarning
import requests
DEFAULT_REQUEST_HEADERS = {
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 "
"Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
}
DEFAULT_REQUEST_TIMEOUT = 10
MAX_PARSABLE_CONTENT_LENGTH = 15 * 1024 * 1024 # 15Mb
socket.setdefaulttimeout(DEFAULT_REQUEST_TIMEOUT)
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

View File

@@ -13,14 +13,11 @@ import requests
import yaml
import feedparser
from bs4 import BeautifulSoup
from urllib3.exceptions import InsecureRequestWarning
from boards.models import Board, BoardFeed, BoardBlock
from utils.images import upload_image_from_url
from scripts.common import DEFAULT_REQUEST_HEADERS
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
@click.command()
@click.option("--config", default="boards.yml", help="Boards YAML file")

View File

@@ -1,3 +1,4 @@
import io
import os
import sys
import django
@@ -7,10 +8,8 @@ django.setup()
import re
import logging
import socket
from datetime import timedelta, datetime
from urllib.parse import urlparse
from urllib3.exceptions import InsecureRequestWarning
from time import mktime
import threading
import queue
@@ -20,25 +19,18 @@ import click
import feedparser
from bs4 import BeautifulSoup
from requests import RequestException
from newspaper import Article as NewspaperArticle, ArticleException, Config
from newspaper import Article as NewspaperArticle, ArticleException
from boards.models import BoardFeed, Article, Board
from scripts.common import DEFAULT_REQUEST_HEADERS
from scripts.common import DEFAULT_REQUEST_HEADERS, DEFAULT_REQUEST_TIMEOUT, MAX_PARSABLE_CONTENT_LENGTH
DEFAULT_NUM_WORKER_THREADS = 5
DEFAULT_ENTRIES_LIMIT = 100
MIN_REFRESH_DELTA = timedelta(minutes=30)
REQUEST_TIMEOUT = 10
MAX_PARSABLE_CONTENT_LENGTH = 15 * 1024 * 1024 # 15Mb
NEWSPAPER_CONFIG = Config()
NEWSPAPER_CONFIG.browser_user_agent = DEFAULT_REQUEST_HEADERS["User-Agent"]
log = logging.getLogger()
queue = queue.Queue()
socket.setdefaulttimeout(REQUEST_TIMEOUT)
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
@click.command()
@click.option("--num-workers", default=DEFAULT_NUM_WORKER_THREADS, help="Number of parser threads")
@@ -210,17 +202,16 @@ def resolve_url(entry_link):
depth -= 1
try:
response = requests.head(url, timeout=REQUEST_TIMEOUT, verify=False)
response = requests.head(url, timeout=DEFAULT_REQUEST_TIMEOUT, verify=False, stream=True)
except RequestException:
log.warning(f"Failed to resolve URL: {url}")
return None, content_type, content_length
if 300 < response.status_code < 400:
url = response.headers["location"]
url = response.headers["location"] # follow redirect
else:
content_type = response.headers.get("content-type")
content_length = int(response.headers.get("content-length")
or MAX_PARSABLE_CONTENT_LENGTH + 1)
content_length = int(response.headers.get("content-length") or 0)
break
return url, content_type, content_length
@@ -285,9 +276,29 @@ def parse_text_and_image(entry):
return text, ""
def load_page_safe(url):
response = requests.get(
url=url,
timeout=DEFAULT_REQUEST_TIMEOUT,
headers=DEFAULT_REQUEST_HEADERS,
stream=True # the most important part — stream response to prevent loading everything into memory
)
html = io.StringIO()
total_bytes = 0
for chunk in response.iter_content(chunk_size=100 * 1024, decode_unicode=True):
total_bytes += len(chunk)
if total_bytes >= MAX_PARSABLE_CONTENT_LENGTH:
return "" # reject too big pages
html.write(chunk)
return html.getvalue()
def load_and_parse_full_article_text_and_image(url):
article = NewspaperArticle(url, config=NEWSPAPER_CONFIG)
article.download()
article = NewspaperArticle(url)
article.set_html(load_page_safe(url)) # safer than article.download()
article.parse()
article.nlp()
return article.summary, article.top_image

View File

@@ -79,9 +79,9 @@
{% if article.description or article.summary %}
<span class="article-tooltip-description">
{% if article.summary %}
{{ article.summary|striptags|truncatechars:600|escape|nl2p|safe }}
{{ article.summary|striptags|truncatechars:700|escape|nl2p|safe }}
{% else %}
{{ article.description|striptags|truncatechars:600|escape|nl2p|safe }}
{{ article.description|striptags|truncatechars:700|escape|nl2p|safe }}
{% endif %}
</span>
{% endif %}