# mkdocs-material/material/plugins/privacy/plugin.py

# Copyright (c) 2016-2024 Martin Donath <martin.donath@squidfunk.com>

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

from __future__ import annotations

import errno
import logging
import os
import posixpath
import re
import requests
import sys

from colorama import Fore, Style
from concurrent.futures import Future, ThreadPoolExecutor, wait
from hashlib import sha1
from mkdocs.config.config_options import ExtraScriptValue
from mkdocs.config.defaults import MkDocsConfig
from mkdocs.exceptions import PluginError
from mkdocs.plugins import BasePlugin, event_priority
from mkdocs.structure.files import File, Files
from mkdocs.utils import is_error_template
from re import Match
from urllib.parse import ParseResult as URL, urlparse, unquote
from xml.etree.ElementTree import Element, tostring

from .config import PrivacyConfig
from .parser import FragmentParser

# -----------------------------------------------------------------------------
# Classes
# -----------------------------------------------------------------------------

# Privacy plugin
class PrivacyPlugin(BasePlugin[PrivacyConfig]):

    # Set up the state of the plugin for a build: the parsed site URL is always
    # remembered, while the thread pool, the list of pending jobs and the
    # collections of external assets are only created if the plugin is enabled
    def on_config(self, config):
        self.site = urlparse(config.site_url or "")
        if self.config.enabled:

            # Thread pool and the futures of all jobs submitted to it
            self.pool = ThreadPoolExecutor(self.config.concurrency)
            self.pool_jobs: list[Future] = []

            # External assets, and the expressions used to find references to
            # further external assets in style sheets and scripts. Expressions
            # from the plugin configuration override the presets.
            self.assets = Files([])
            expr_map = {
                ".css": r"url\((\s*http?[^)]+)\)",
                ".js": r"[\"'](http[^\"']+\.(?:css|js(?:on)?))[\"']"
            }
            expr_map.update(self.config.assets_expr_map)
            self.assets_expr_map = expr_map

    # Collect all external assets that are known when the build starts (run
    # latest) - this hook runs after all other plugins, so that assets added
    # by them are picked up as well
    @event_priority(-100)
    def on_files(self, files, *, config):

        # Nothing to do if the plugin is disabled or external assets must not
        # be processed
        if not (self.config.enabled and self.config.assets):
            return

        # Scan all media files that are part of the build (= already known to
        # MkDocs on startup) for references to external assets
        for initiator in files.media_files():
            file = None
            for url in self._parse_media(initiator):
                if self._is_excluded(url, initiator):
                    continue

                # Create and enqueue a job for the external asset
                file = self._queue(url, config, concurrent = True)

                # Mermaid.js is a special case, as Material for MkDocs loads
                # it automatically when a Mermaid diagram is found in a page -
                # see https://bit.ly/36tZXsA. Thus, if no site URL is given,
                # we make sure that it is always part of the extra scripts.
                if config.site_url or "mermaid.min.js" not in url.path:
                    continue

                # Register script, if not already done
                path = url.geturl()
                if path not in config.extra_javascript:
                    config.extra_javascript.append(ExtraScriptValue(path))

            # The media file references at least one external asset, so we
            # take it over from MkDocs in order to replace the links later
            if file:
                self.assets.append(initiator)
                files.remove(initiator)

        # Enqueue a job for each external style sheet file
        for url in map(urlparse, config.extra_css):
            if not self._is_excluded(url):
                self._queue(url, config, concurrent = True)

        # Enqueue a job for each external script file - scripts that are given
        # as plain strings are wrapped first, so the path can be read from them
        for script in config.extra_javascript:
            if isinstance(script, str):
                script = ExtraScriptValue(script)

            # Enqueue a job if the script needs to be downloaded
            url = urlparse(script.path)
            if not self._is_excluded(url):
                self._queue(url, config, concurrent = True)

    # Enqueue external images found in page content (run latest) - images are
    # the most common type of external asset when writing, and this is the
    # earliest stage at which they can be discovered. For each image found, a
    # job is created and enqueued that downloads the image if necessary.
    @event_priority(-100)
    def on_page_content(self, html, *, page, config, files):

        # Nothing to do if the plugin is disabled or external assets must not
        # be processed
        if not (self.config.enabled and self.config.assets):
            return

        # Find all image tags that point to an absolute URL
        expr = re.compile(
            r"<img[^>]+src=['\"]?http[^>]+>",
            flags = re.I | re.M
        )
        for tag in expr.findall(html):
            el = self._parse_fragment(tag)

            # Create and enqueue job to fetch external image, if not excluded
            url = urlparse(el.get("src"))
            if self._is_excluded(url, page.file):
                continue
            self._queue(url, config, concurrent = True)

    # Replace links to external assets in rendered templates (run later)
    @event_priority(-50)
    def on_post_template(self, output_content, *, template_name, config):

        # Only HTML templates are processed, and only if the plugin is enabled,
        # which means sitemap.xml and other non-HTML files are skipped
        if not (self.config.enabled and template_name.endswith(".html")):
            return

        # Templates are not backed by a file, so we create one to act as the
        # initiator, and return the template with all links replaced
        return self._parse_html(
            output_content,
            File(template_name, config.docs_dir, config.site_dir, False),
            config
        )

    # Replace links to external assets in rendered pages (run later)
    @event_priority(-50)
    def on_post_page(self, output, *, page, config):

        # Return the page with all links to external assets replaced, using
        # the file of the page as the initiator - nothing is returned if the
        # plugin is disabled
        if self.config.enabled:
            return self._parse_html(output, page.file, config)

    # Reconcile jobs (run earlier)
    @event_priority(50)
    def on_post_build(self, *, config):
        if not self.config.enabled:
            return

        # Reconcile concurrent jobs and clear thread pool, as we will reuse the
        # same thread pool for patching all links to external assets
        wait(self.pool_jobs)
        self.pool_jobs.clear()

        # Spawn concurrent job to patch all links to dependent external asset
        # in all style sheet and script files
        for file in self.assets:
            _, extension = posixpath.splitext(file.dest_uri)
            if extension in [".css", ".js"]:
                self.pool_jobs.append(self.pool.submit(
                    self._patch, file
                ))

            # Otherwise just copy external asset to output directory
            else:
                file.copy_file()

        # Reconcile concurrent jobs for the last time, so the plugins following
        # in the build process always have a consistent state to work with
        wait(self.pool_jobs)
        self.pool.shutdown()

    # -------------------------------------------------------------------------

    # Check if the given URL is external
    def _is_external(self, url: URL):
        hostname = url.hostname or self.site.hostname
        return hostname != self.site.hostname

    # Check if the given URL is excluded
    def _is_excluded(self, url: URL, initiator: File | None = None):
        if not self._is_external(url):
            return True

        # Skip if external assets must not be processed
        if not self.config.assets:
            return True

        # If initiator is given, format for printing
        via = ""
        if initiator:
            via = "".join([
                Fore.WHITE, Style.DIM,
                f"in '{initiator.src_uri}' ",
                Style.RESET_ALL
            ])

        # Print warning if fetching is not enabled
        if not self.config.assets_fetch:
            log.warning(f"External file: {url.geturl()} {via}")
            return True

        # File is not excluded
        return False

    # -------------------------------------------------------------------------

    # Parse an HTML fragment, i.e., a single tag, and return it as an element.
    # A plugin error is raised if the fragment does not yield an element.
    def _parse_fragment(self, fragment: str):
        parser = FragmentParser()
        parser.feed(fragment)
        parser.close()

        # Tags are extracted with pretty basic regular expressions, so if the
        # author accidentally used invalid HTML inside of the tag, e.g., forgot
        # an opening or closing quote, parsing doesn't yield an element, which
        # we need to catch here
        result = parser.result
        if not isinstance(result, Element):
            raise PluginError(
                f"Could not parse due to possible syntax error in HTML: \n\n"
                + fragment
            )

        # Return element
        return result

    # Parse and extract all external assets from a media file using a preset
    # regular expression, and return all URLs found.
    def _parse_media(self, initiator: File) -> list[URL]:
        _, extension = posixpath.splitext(initiator.dest_uri)
        if extension not in self.assets_expr_map:
            return []

        # Find and extract all external asset URLs
        expr = re.compile(self.assets_expr_map[extension], flags = re.I | re.M)
        with open(initiator.abs_src_path, encoding = "utf-8-sig") as f:
            return [urlparse(url) for url in re.findall(expr, f.read())]

    # Replace all links to external assets in the HTML of a template or page
    # and return the result. Assets that were directly referenced in the
    # document should already be downloaded at this point, but there may still
    # be external assets that were added by third-party plugins, which are
    # fetched synchronously while replacing.
    def _parse_html(self, output: str, initiator: File, config: MkDocsConfig):

        # Compute the URL under which the given file is linked from the
        # initiator - links are relative, except for error templates, where
        # they are anchored at the path of the site URL
        def resolve(file: File):
            if not is_error_template(initiator.src_uri):
                return file.url_relative_to(initiator)
            base = urlparse(config.site_url or "/")
            return posixpath.join(base.path, file.url)

        # Replace a single tag that was matched by the expression below
        def replace(match: Match):
            el = self._parse_fragment(match.group())
            tag = el.tag

            # Handle external style sheet, favicon or preconnect hint
            if tag == "link":
                url = urlparse(el.get("href"))
                if not self._is_excluded(url, initiator):
                    rel = el.get("rel", "")

                    # Remove external preconnect hint
                    if rel == "preconnect":
                        return ""

                    # Point external style sheet or favicon to local copy
                    if rel in ("stylesheet", "icon"):
                        el.set("href", resolve(self._queue(url, config)))

            # Handle external script or image
            elif tag in ("script", "img"):
                url = urlparse(el.get("src"))
                if not self._is_excluded(url, initiator):
                    el.set("src", resolve(self._queue(url, config)))

            # Return element as string
            return self._print(el)

        # Find and replace all tags that link to an absolute URL
        expr = re.compile(
            r"<(?:(?:a|link)[^>]+href|(?:script|img)[^>]+src)=['\"]?http[^>]+>",
            flags = re.I | re.M
        )
        return expr.sub(replace, output)

    # -------------------------------------------------------------------------

    # Print element as string - what could possibly go wrong? We're parsing
    # HTML5 with an XML parser, and XML doesn't allow for boolean attributes,
    # which is why we must add a dummy value to all attributes that are not
    # strings before printing the element as string.
    def _print(self, el: Element):
        temp = "__temp__"
        for name in el.attrib:
            if not isinstance(el.attrib[name], str):
                el.attrib[name] = temp

        # Return void or opening tag as string, strip closing tag
        data = tostring(el, encoding = "unicode")
        return data.replace(" />", ">").replace(f"\"{temp}\"", "")

    # Enqueue external asset for download, if not already done
    def _queue(self, url: URL, config: MkDocsConfig, concurrent = False):
        path = self._path_from_url(url)
        full = posixpath.join(self.config.assets_fetch_dir, path)

        # Try to retrieve existing file
        file = self.assets.get_file_from_path(full)
        if not file:

            # Compute path to external asset, which is sourced from the cache
            # directory, and generate file to register it with MkDocs as soon
            # as it was downloaded. This allows other plugins to apply
            # additional processing.
            file = self._path_to_file(path, config)
            file.url = url.geturl()

            # Spawn concurrent job to fetch external asset if the extension is
            # known and the concurrent flag is set. In that case, this function
            # is called in a context where no replacements are carried out, so
            # the caller must only ensure to reconcile the concurrent jobs.
            _, extension = posixpath.splitext(url.path)
            if extension and concurrent:
                self.pool_jobs.append(self.pool.submit(
                    self._fetch, file, config
                ))

            # Fetch external asset synchronously, as it either has no extension
            # or is fetched from a context in which replacements are done
            else:
                self._fetch(file, config)

            # Register external asset as file - it might have already been
            # registered, and since MkDocs 1.6, trigger a deprecation warning
            if not self.assets.get_file_from_path(file.src_uri):
                self.assets.append(file)

        # If the URL of the external asset includes a hash fragment, add it to
        # the returned file, e.g. for dark/light images - see https://t.ly/7b16Y
        if url.fragment:
            file.url += f"#{url.fragment}"

        # Return file associated with external asset
        return file

    # Fetch external asset referenced through the given file, unless it is
    # already present in the cache, then finalize the paths and the URL of the
    # file, and enqueue all external assets the file itself depends on.
    #
    # file:   file associated with the external asset, whose URL is used for
    #         downloading, and is replaced with the destination URL
    # config: MkDocs configuration, passed on when enqueuing dependent assets
    def _fetch(self, file: File, config: MkDocsConfig):

        # Check if external asset needs to be downloaded
        if not os.path.isfile(file.abs_src_path) or not self.config.cache:
            path = file.abs_src_path

            # Download external asset
            log.info(f"Downloading external file: {file.url}")
            res = requests.get(file.url, headers = {

                # Set user agent explicitly, so Google Fonts gives us *.woff2
                # files, which according to caniuse.com is the only format we
                # need to download as it covers the entire range of browsers
                # we're officially supporting.
                "User-Agent": " ".join([
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
                    "AppleWebKit/537.36 (KHTML, like Gecko)",
                    "Chrome/98.0.4758.102 Safari/537.36"
                ])
            })

            # Compute expected file extension and append if missing. Servers
            # are not required to send a content type, in which case no
            # extension is appended. Media types are case-insensitive, so
            # we normalize before looking up the extension.
            mime = res.headers.get("content-type", "").split(";")[0]
            extension = extensions.get(mime.strip().lower())
            if extension and not path.endswith(extension):
                path += extension

            # Save to file and create symlink if no extension was present
            self._save_to_file(path, res.content)
            if path != file.abs_src_path:

                # Creating symlinks might fail on Windows. Thus, we just print
                # a warning and continue - see https://bit.ly/3xYFzcZ
                try:
                    os.symlink(os.path.basename(path), file.abs_src_path)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        log.warning(
                            f"Couldn't create symbolic link: {file.src_uri}"
                        )

                    # Fall back for when the symlink could not be created. This
                    # means that the plugin will download the original file on
                    # every build, as the content type cannot be resolved from
                    # the file extension.
                    file.abs_src_path = path

        # Resolve destination if file points to a symlink
        if os.path.isfile(file.abs_src_path):
            file.abs_src_path = os.path.realpath(file.abs_src_path)
            _, extension = os.path.splitext(file.abs_src_path)

            # If the symlink could not be created, we already set the correct
            # extension, so we need to make sure not to append it again
            if not file.abs_dest_path.endswith(extension):
                file.src_uri += extension

                # Compute destination file system path
                file.dest_uri += extension
                file.abs_dest_path += extension

        # Compute destination URL
        file.url = file.dest_uri

        # Parse and enqueue dependent external assets
        for url in self._parse_media(file):
            if not self._is_excluded(url, file):
                self._queue(url, config, concurrent = True)

    # Patch all links to external assets in the given file
    def _patch(self, initiator: File):
        with open(initiator.abs_src_path, encoding = "utf-8-sig") as f:

            # Replace callback
            def replace(match: Match):
                value = match.group(1)

                # Map URL to canonical path
                path = self._path_from_url(urlparse(value))
                full = posixpath.join(self.config.assets_fetch_dir, path)

                # Try to retrieve existing file
                file = self.assets.get_file_from_path(full)
                if not file:
                    name = os.readlink(os.path.join(self.config.cache_dir, full))
                    full = posixpath.join(posixpath.dirname(full), name)

                    # Try again after resolving symlink
                    file = self.assets.get_file_from_path(full)

                # This can theoretically never happen, as we're sure that we
                # only replace files that we successfully extracted. However,
                # we might have missed several cases, so it's better to throw
                # here than to swallow the error.
                if not file:
                    log.error(
                        "File not found. This is likely a bug in the built-in "
                        "privacy plugin. Please create an issue with a minimal "
                        "reproduction."
                    )
                    sys.exit(1)

                # Create absolute URL for asset in script
                if file.url.endswith(".js"):
                    url = posixpath.join(self.site.geturl(), file.url)

                # Create relative URL for everything else
                else:
                    url = file.url_relative_to(initiator)

                # Switch external asset URL to local path
                return match.group().replace(value, url)

            # Resolve replacement expression according to asset type
            _, extension = posixpath.splitext(initiator.dest_uri)
            expr = re.compile(self.assets_expr_map[extension], re.I | re.M)

            # Resolve links to external assets in file
            self._save_to_file(
                initiator.abs_dest_path,
                expr.sub(replace, f.read())
            )

    # -------------------------------------------------------------------------

    # Normalize (= canonicalize) path by removing trailing slashes, and ensure
    # that hidden folders (`.` after `/`) are unhidden. Otherwise MkDocs will
    # not consider them being part of the build and refuse to copy them.
    def _path_from_url(self, url: URL):
        path = posixpath.normpath(url.path)
        path = re.sub(r"/\.", "/_", path)

        # Compute digest of query string, as some URLs yield different results
        # for different query strings, e.g. https://unsplash.com/random?Coffee
        if url.query:
            name, extension = posixpath.splitext(path)

            # Inject digest after file name and before file extension, as
            # done for style sheet and script files as well
            digest = sha1(url.query.encode("utf-8")).hexdigest()[:8]
            path = f"{name}.{digest}{extension}"

        # Create and return URL without leading double slashes
        url = url._replace(scheme = "", query = "", fragment = "", path = path)
        return url.geturl()[2:]

    # Create a file for the given path, which is unquoted and located inside
    # of the directory for fetched assets. The file is sourced from the cache
    # directory and written to the site directory.
    def _path_to_file(self, path: str, config: MkDocsConfig):
        src_uri = posixpath.join(self.config.assets_fetch_dir, unquote(path))
        cache_dir = os.path.abspath(self.config.cache_dir)
        return File(src_uri, cache_dir, config.site_dir, False)

    # Create a file on the system with the given content
    def _save_to_file(self, path: str, content: str | bytes):
        os.makedirs(os.path.dirname(path), exist_ok = True)
        if isinstance(content, str):
            content = bytes(content, "utf-8")
        with open(path, "wb") as f:
            f.write(content)

# -----------------------------------------------------------------------------
# Data
# -----------------------------------------------------------------------------

# Logger for all messages emitted by the privacy plugin
log = logging.getLogger("mkdocs.material.privacy")

# File extensions that are expected for the content types of external assets,
# used for appending the extension to downloaded files when it's missing
extensions = {

    # Style sheets and scripts
    "text/css": ".css",
    "text/javascript": ".js",
    "application/javascript": ".js",

    # Images
    "image/avif": ".avif",
    "image/gif": ".gif",
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/svg+xml": ".svg",
    "image/webp": ".webp"
}