Files
infomate.club/scripts/initialize.py

207 lines
7.1 KiB
Python

import os
import sys
import django
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
sys.path.append(BASE_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "infomate.settings")
django.setup()
from urllib.parse import urljoin
import click
import requests
import yaml
from bs4 import BeautifulSoup
from boards.models import Board, BoardFeed, BoardBlock
from boards.icons import DOMAIN_FAVICONS
from utils.images import upload_image_from_url
from scripts.common import DEFAULT_REQUEST_HEADERS, parse_domain
@click.command()
@click.option("--config", default="boards.yml", help="Boards YAML file")
@click.option("--board-slug", default=None, help="Board slug to parse only one exact board")
@click.option("--upload-favicons/--no-upload-favicons", default=False, help="Upload favicons")
@click.option("-y", "always_yes", is_flag=True, help="Don't ask any questions (good for scripts)")
def initialize(config, board_slug, upload_favicons, always_yes):
yaml_file = os.path.join(BASE_DIR, config)
with open(yaml_file) as f:
try:
config = yaml.load(f.read(), Loader=yaml.FullLoader)
except yaml.YAMLError as ex:
print(f"Bad YAML file '{yaml_file}': {ex}")
exit(1)
if not always_yes:
input(f"Initializing feeds from {yaml_file}. Press Enter to continue...")
for board_index, board_config in enumerate(config.get("boards") or []):
if board_slug and board_config["slug"] != board_slug:
continue
board_name = board_config.get("name") or board_config["slug"]
print(f"Creating board: {board_name}...")
board, is_created = Board.objects.update_or_create(
slug=board_config["slug"],
defaults=dict(
name=board_name or board_config["slug"],
avatar=board_config["curator"].get("avatar"),
curator_name=board_config["curator"].get("name"),
curator_title=board_config["curator"].get("title"),
curator_footer=board_config["curator"].get("footer"),
curator_bio=board_config["curator"].get("bio"),
curator_url=board_config["curator"].get("url"),
is_private=board_config.get("is_private"),
is_visible=board_config.get("is_visible"),
index=board_index,
)
)
for block_index, block_config in enumerate(board_config.get("blocks") or []):
block_name = block_config.get("name") or ""
print(f"\nCreating block: {block_name}...")
block, is_created = BoardBlock.objects.update_or_create(
board=board,
slug=block_config["slug"],
defaults=dict(
name=block_name,
index=block_index,
view=block_config.get("view") or BoardBlock.DEFAULT_VIEW,
)
)
if not block_config.get("feeds"):
continue
updated_feed_urls = set()
for feed_index, feed_config in enumerate(block_config.get("feeds") or []):
feed_name = feed_config.get("name")
feed_mix = feed_config.get("mix")
if feed_mix:
feed_url = feed_config.get("url") or f"mix:{'|'.join(feed_mix)}"
feed_rss = None
else:
feed_url = feed_config["url"]
feed_rss = feed_config["rss"]
updated_feed_urls.add(feed_url)
print(f"Creating or updating feed {feed_name} ({feed_url})...")
feed, is_created = BoardFeed.objects.update_or_create(
board=board,
block=block,
url=feed_url,
defaults=dict(
rss=feed_rss,
mix=feed_mix,
name=feed_name,
comment=feed_config.get("comment"),
icon=feed_config.get("icon"),
index=feed_index,
columns=feed_config.get("columns") or 1,
conditions=feed_config.get("conditions"),
filters=feed_config.get("filters"),
is_parsable=feed_config.get("is_parsable", True),
view=feed_config.get("view") or BoardFeed.DEFAULT_VIEW,
)
)
html = None
if not feed.mix:
if not feed.icon:
feed.icon = DOMAIN_FAVICONS.get(parse_domain(feed_url))
if not feed.icon:
html = html or load_page_html(feed_url)
icon = feed_config.get("icon")
if not icon and html:
icon = find_favicon(feed_url, html)
print(f"- found favicon: {icon}")
if upload_favicons:
icon = upload_image_from_url(icon)
print(f"- uploaded favicon: {icon}")
feed.icon = icon[:512] if icon else None
feed.save()
# delete unused feeds
BoardFeed.objects.filter(
board=board,
block=block
).exclude(
url__in=updated_feed_urls
).delete()
# delete unused blocks
BoardBlock.objects.filter(
board=board,
).exclude(
slug__in={block["slug"] for block in board_config.get("blocks") or []}
).delete()
print("Done ✅")
def load_page_html(url):
try:
return requests.get(
url=url,
headers=DEFAULT_REQUEST_HEADERS,
allow_redirects=True,
timeout=30,
verify=False
).text
except Exception as ex:
print(f"🚨 Error loading page {url}: {ex}")
# def find_rss_feed(url, html):
# bs = BeautifulSoup(html, features="lxml")
# possible_feeds = set()
#
# feed_urls = bs.findAll("link", rel="alternate")
# for feed_url in feed_urls:
# t = feed_url.get("type", None)
# if t:
# if "rss" in t or "xml" in t:
# href = feed_url.get("href", None)
# if href:
# possible_feeds.add(urljoin(url, href))
#
# a_tags = bs.findAll("a")
# for a in a_tags:
# href = a.get("href", None)
# if href:
# if "xml" in href or "rss" in href or "feed" in href:
# possible_feeds.add(urljoin(url, href))
#
# for feed_url in possible_feeds:
# feed = feedparser.parse(feed_url)
# if feed.entries:
# return feed_url
#
# return None
def find_favicon(url, html):
bs = BeautifulSoup(html, features="lxml")
link_tags = bs.findAll("link")
for link_tag in link_tags:
rel = link_tag.get("rel", None)
if rel and "icon" in rel:
href = link_tag.get("href", None)
if href:
return urljoin(url, href)
return None
if __name__ == '__main__':
initialize()