From 61a3f3485f656772440f40196ffe61be028168ea Mon Sep 17 00:00:00 2001 From: vas3k Date: Fri, 10 Jan 2020 09:20:49 +0100 Subject: [PATCH] Add user-agent to article parser --- scripts/__init__.py | 0 scripts/common.py | 7 +++++++ scripts/initialize.py | 7 +------ scripts/update.py | 10 +++++++--- 4 files changed, 15 insertions(+), 9 deletions(-) create mode 100644 scripts/__init__.py create mode 100644 scripts/common.py diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/common.py b/scripts/common.py new file mode 100644 index 0000000..204c457 --- /dev/null +++ b/scripts/common.py @@ -0,0 +1,7 @@ +DEFAULT_REQUEST_HEADERS = { + "User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 " + "Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" +} + + diff --git a/scripts/initialize.py b/scripts/initialize.py index 1ede462..eea6058 100644 --- a/scripts/initialize.py +++ b/scripts/initialize.py @@ -16,12 +16,7 @@ from bs4 import BeautifulSoup from boards.models import Board, BoardFeed, BoardBlock from utils.images import upload_image_from_url - - -DEFAULT_REQUEST_HEADERS = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" -} +from scripts.common import DEFAULT_REQUEST_HEADERS @click.command() diff --git a/scripts/update.py b/scripts/update.py index 6fec7b0..4b48a8b 100644 --- a/scripts/update.py +++ b/scripts/update.py @@ -20,15 +20,18 @@ import click import feedparser from bs4 import BeautifulSoup from requests import RequestException -from newspaper import Article as NewspaperArticle, ArticleException +from newspaper import Article as NewspaperArticle, ArticleException, Config from boards.models import BoardFeed, Article, Board +from scripts.common import DEFAULT_REQUEST_HEADERS DEFAULT_NUM_WORKER_THREADS = 5 DEFAULT_ENTRIES_LIMIT = 100 MIN_REFRESH_DELTA = timedelta(minutes=30) REQUEST_TIMEOUT = 10 MAX_PARSABLE_CONTENT_LENGTH = 5 * 1024 * 1024 # 5Mb +NEWSPAPER_CONFIG = Config() +NEWSPAPER_CONFIG.browser_user_agent = DEFAULT_REQUEST_HEADERS["User-Agent"] log = logging.getLogger() queue = queue.Queue() @@ -121,7 +124,8 @@ def refresh_feed(item): created_at=parse_datetime(entry), updated_at=datetime.utcnow(), title=entry_title[:256], - image=str(parse_image(entry) or "")[:512] + image=str(parse_image(entry) or "")[:512], + description=entry.get("summary"), ) ) @@ -235,7 +239,7 @@ def parse_text_and_image(entry): def load_and_parse_full_article_text_and_image(url): - article = NewspaperArticle(url) + article = NewspaperArticle(url, config=NEWSPAPER_CONFIG) article.download() article.parse() article.nlp()