Add user-agent to article parser
This commit is contained in:
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
7
scripts/common.py
Normal file
7
scripts/common.py
Normal file
@@ -0,0 +1,7 @@
|
||||
DEFAULT_REQUEST_HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 "
|
||||
"Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||
}
|
||||
|
||||
|
||||
@@ -16,12 +16,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
from boards.models import Board, BoardFeed, BoardBlock
|
||||
from utils.images import upload_image_from_url
|
||||
|
||||
|
||||
DEFAULT_REQUEST_HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
|
||||
}
|
||||
from scripts.common import DEFAULT_REQUEST_HEADERS
|
||||
|
||||
|
||||
@click.command()
|
||||
|
||||
@@ -20,15 +20,18 @@ import click
|
||||
import feedparser
|
||||
from bs4 import BeautifulSoup
|
||||
from requests import RequestException
|
||||
from newspaper import Article as NewspaperArticle, ArticleException
|
||||
from newspaper import Article as NewspaperArticle, ArticleException, Config
|
||||
|
||||
from boards.models import BoardFeed, Article, Board
|
||||
from scripts.common import DEFAULT_REQUEST_HEADERS
|
||||
|
||||
DEFAULT_NUM_WORKER_THREADS = 5
|
||||
DEFAULT_ENTRIES_LIMIT = 100
|
||||
MIN_REFRESH_DELTA = timedelta(minutes=30)
|
||||
REQUEST_TIMEOUT = 10
|
||||
MAX_PARSABLE_CONTENT_LENGTH = 5 * 1024 * 1024 # 5Mb
|
||||
NEWSPAPER_CONFIG = Config()
|
||||
NEWSPAPER_CONFIG.browser_user_agent = DEFAULT_REQUEST_HEADERS["User-Agent"]
|
||||
|
||||
log = logging.getLogger()
|
||||
queue = queue.Queue()
|
||||
@@ -121,7 +124,8 @@ def refresh_feed(item):
|
||||
created_at=parse_datetime(entry),
|
||||
updated_at=datetime.utcnow(),
|
||||
title=entry_title[:256],
|
||||
image=str(parse_image(entry) or "")[:512]
|
||||
image=str(parse_image(entry) or "")[:512],
|
||||
description=entry.get("summary"),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -235,7 +239,7 @@ def parse_text_and_image(entry):
|
||||
|
||||
|
||||
def load_and_parse_full_article_text_and_image(url):
|
||||
article = NewspaperArticle(url)
|
||||
article = NewspaperArticle(url, config=NEWSPAPER_CONFIG)
|
||||
article.download()
|
||||
article.parse()
|
||||
article.nlp()
|
||||
|
||||
Reference in New Issue
Block a user