Add user-agent to article parser

This commit is contained in:
vas3k
2020-01-10 09:20:49 +01:00
parent 26704f5807
commit 61a3f3485f
4 changed files with 15 additions and 9 deletions

0
scripts/__init__.py Normal file
View File

7
scripts/common.py Normal file
View File

@@ -0,0 +1,7 @@
# Fragments of the shared User-Agent string: a mobile Chrome-on-Android
# browser signature that also announces itself as Googlebot/2.1.
# Each fragment carries its own trailing space, so they are joined with "".
_USER_AGENT_FRAGMENTS = (
    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 ",
    "Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
)

# Default HTTP request headers imported by the scripts in this package.
DEFAULT_REQUEST_HEADERS = {"User-Agent": "".join(_USER_AGENT_FRAGMENTS)}

View File

@@ -16,12 +16,7 @@ from bs4 import BeautifulSoup
from boards.models import Board, BoardFeed, BoardBlock
from utils.images import upload_image_from_url
DEFAULT_REQUEST_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
from scripts.common import DEFAULT_REQUEST_HEADERS
@click.command()

View File

@@ -20,15 +20,18 @@ import click
import feedparser
from bs4 import BeautifulSoup
from requests import RequestException
from newspaper import Article as NewspaperArticle, ArticleException
from newspaper import Article as NewspaperArticle, ArticleException, Config
from boards.models import BoardFeed, Article, Board
from scripts.common import DEFAULT_REQUEST_HEADERS
# Default worker-thread count (going by the name; the consumer is outside this hunk).
DEFAULT_NUM_WORKER_THREADS = 5
# Default cap on entries — presumably per feed; TODO confirm against the caller.
DEFAULT_ENTRIES_LIMIT = 100
# 30-minute interval; presumably the minimum gap between refreshes of one feed — TODO confirm.
MIN_REFRESH_DELTA = timedelta(minutes=30)
# NOTE(review): presumably seconds, used as an HTTP request timeout — confirm at the call site.
REQUEST_TIMEOUT = 10
MAX_PARSABLE_CONTENT_LENGTH = 5 * 1024 * 1024 # 5 MiB; presumably a byte count — TODO confirm
# newspaper Config passed to NewspaperArticle so that article downloads send
# the same User-Agent string as DEFAULT_REQUEST_HEADERS.
NEWSPAPER_CONFIG = Config()
NEWSPAPER_CONFIG.browser_user_agent = DEFAULT_REQUEST_HEADERS["User-Agent"]
# getLogger() called without a name returns the root logger.
log = logging.getLogger()
# NOTE(review): this rebinds the name `queue` to a Queue instance, so the
# `queue` module is no longer reachable under that name after this line.
queue = queue.Queue()
@@ -121,7 +124,8 @@ def refresh_feed(item):
created_at=parse_datetime(entry),
updated_at=datetime.utcnow(),
title=entry_title[:256],
image=str(parse_image(entry) or "")[:512]
image=str(parse_image(entry) or "")[:512],
description=entry.get("summary"),
)
)
@@ -235,7 +239,7 @@ def parse_text_and_image(entry):
def load_and_parse_full_article_text_and_image(url):
article = NewspaperArticle(url)
article = NewspaperArticle(url, config=NEWSPAPER_CONFIG)
article.download()
article.parse()
article.nlp()