# Copyright (c) 2016-2023 Martin Donath <martin.donath@squidfunk.com>

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

import json
import logging
import os
import regex as re

from html import escape
from html.parser import HTMLParser
from mkdocs import utils
from mkdocs.config.config_options import SubConfig
from mkdocs.plugins import BasePlugin

from material.plugins.search.config import SearchConfig, SearchFieldConfig

try:
    import jieba
except ImportError:
    jieba = None

# -----------------------------------------------------------------------------
# Class
# -----------------------------------------------------------------------------

# Search plugin
class SearchPlugin(BasePlugin[SearchConfig]):

    # Determine whether we're running under dirty reload
    def on_startup(self, *, command, dirty):
        self.is_dirtyreload = False
        self.is_dirty = dirty

        # Initialize search index cache
        self.search_index_prev = None

    # Initialize plugin
    def on_config(self, config):
        if not self.config.lang:
            self.config.lang = [self._translate(
                config, "search.config.lang"
            )]

        # Retrieve default value for separator
        if not self.config.separator:
            self.config.separator = self._translate(
                config, "search.config.separator"
            )

        # Retrieve default value for pipeline
        if not self.config.pipeline:
            self.config.pipeline = list(filter(len, re.split(
                r"\s*,\s*", self._translate(config, "search.config.pipeline")
            )))

        # Initialize search index
        self.search_index = SearchIndex(**self.config)

        # Set jieba dictionary, if given
        if self.config.jieba_dict:
            path = os.path.normpath(self.config.jieba_dict)
            if os.path.exists(path):
                jieba.set_dictionary(path)
                log.debug(f"Loading jieba dictionary: {path}")
            else:
                log.warning(
                    f"Configuration error for 'search.jieba_dict': "
                    f"'{self.config.jieba_dict}' does not exist."
                )

        # Set jieba user dictionary, if given
        if self.config.jieba_dict_user:
            path = os.path.normpath(self.config.jieba_dict_user)
            if os.path.exists(path):
                jieba.load_userdict(path)
                log.debug(f"Loading jieba user dictionary: {path}")
            else:
                log.warning(
                    f"Configuration error for 'search.jieba_dict_user': "
                    f"'{self.config.jieba_dict_user}' does not exist."
                )

    # Add page to search index
    def on_page_context(self, context, *, page, config, nav):
        self.search_index.add_entry_from_context(page)
        page.content = re.sub(
            r"\s?data-search-\w+=\"[^\"]+\"",
            "",
            page.content
        )

    # Generate search index
    def on_post_build(self, *, config):
        base = os.path.join(config.site_dir, "search")
        path = os.path.join(base, "search_index.json")

        # Generate and write search index to file
        data = self.search_index.generate_search_index(self.search_index_prev)
        utils.write_file(data.encode("utf-8"), path)

        # Persist search index for repeated invocation
        if self.is_dirty:
            self.search_index_prev = self.search_index

    # Determine whether we're running under dirty reload
    def on_serve(self, server, *, config, builder):
        self.is_dirtyreload = self.is_dirty

    # -------------------------------------------------------------------------

    # Translate the given placeholder value
    def _translate(self, config, value):
        env = config.theme.get_env()

        # Load language template and return translation for placeholder
        language = "partials/language.html"
        template = env.get_template(language, None, { "config": config })
        return template.module.t(value)

# -----------------------------------------------------------------------------

# Search index with support for additional fields
class SearchIndex:

    # Initialize search index
    def __init__(self, **config):
        self.config = config
        self.entries = []

    # Add page to search index
    def add_entry_from_context(self, page):
        search = page.meta.get("search", {})
        if search.get("exclude"):
            return

        # Divide page content into sections
        parser = Parser()
        parser.feed(page.content)
        parser.close()

        # Add sections to index
        for section in parser.data:
            if not section.is_excluded():
                self.create_entry_for_section(section, page.toc, page.url, page)

    # Override: graceful indexing and additional fields
    def create_entry_for_section(self, section, toc, url, page):
        item = self._find_toc_by_id(toc, section.id)
        if item:
            url = url + item.url
        elif section.id:
            url = url + "#" + section.id

        # Set page title as section title if none was given, which happens when
        # the first headline in a Markdown document is not a h1 headline. Also,
        # if a page title was set via front matter, use that even though a h1
        # might be given or the page name was specified in nav in mkdocs.yml
        if not section.title:
            section.title = [str(page.meta.get("title", page.title))]

        # Compute title and text
        title = "".join(section.title).strip()
        text  = "".join(section.text).strip()

        # Segment Chinese characters if jieba is available
        if jieba:
            title = self._segment_chinese(title)
            text  = self._segment_chinese(text)

        # Create entry for section
        entry = {
            "location": url,
            "title": title,
            "text": text
        }

        # Set document tags
        tags = page.meta.get("tags")
        if isinstance(tags, list):
            entry["tags"] = []
            for name in tags:
                if name and isinstance(name, (str, int, float, bool)):
                    entry["tags"].append(name)

        # Set document boost
        search = page.meta.get("search", {})
        if "boost" in search:
            entry["boost"] = search["boost"]

        # Add entry to index
        self.entries.append(entry)

    # Generate search index
    def generate_search_index(self, prev):
        config = {
            key: self.config[key]
                for key in ["lang", "separator", "pipeline"]
        }

        # Hack: if we're running under dirty reload, the search index will only
        # include the entries for the current page. However, MkDocs > 1.4 allows
        # us to persist plugin state across rebuilds, which is exactly what we
        # do by passing the previously built index to this method. Thus, we just
        # remove the previous entries for the current page, and append the new
        # entries to the end of the index, as order doesn't matter.
        if prev and self.entries:
            path = self.entries[0]["location"]

            # Since we're sure that we're running under dirty reload, the list
            # of entries will only contain sections for a single page. Thus, we
            # use the first entry to remove all entries from the previous run
            # that belong to the current page. The rationale behind this is that
            # authors might add or remove section headers, so we need to make
            # sure that sections are synchronized correctly.
            entries = [
                entry for entry in prev.entries
                    if not entry["location"].startswith(path)
            ]

            # Merge previous with current entries
            self.entries = entries + self.entries

        # Otherwise just set previous entries
        if prev and not self.entries:
            self.entries = prev.entries

        # Return search index as JSON
        data = { "config": config, "docs": self.entries }
        return json.dumps(
            data,
            separators = (",", ":"),
            default = str
        )

    # -------------------------------------------------------------------------

    # Retrieve item for anchor
    def _find_toc_by_id(self, toc, id):
        for toc_item in toc:
            if toc_item.id == id:
                return toc_item

            # Recurse into children of item
            toc_item = self._find_toc_by_id(toc_item.children, id)
            if toc_item is not None:
                return toc_item

        # No item found
        return None

    # Find and segment Chinese characters in string
    def _segment_chinese(self, data):
        expr = re.compile(r"(\p{IsHan}+)", re.UNICODE)

        # Replace callback
        def replace(match):
            value = match.group(0)

            # Replace occurrence in original string with segmented version and
            # surround with zero-width whitespace for efficient indexing
            return "".join([
                "\u200b",
                "\u200b".join(jieba.cut(value.encode("utf-8"))),
                "\u200b",
            ])

        # Return string with segmented occurrences
        return expr.sub(replace, data).strip("\u200b")

# -----------------------------------------------------------------------------

# HTML element
class Element:
    """
    An element with attributes, essentially a small wrapper object for the
    parser to access attributes in other callbacks than handle_starttag.
    """

    # Initialize HTML element
    def __init__(self, tag, attrs = dict()):
        self.tag   = tag
        self.attrs = attrs

    # String representation
    def __repr__(self):
        return self.tag

    # Support comparison (compare by tag only)
    def __eq__(self, other):
        if other is Element:
            return self.tag == other.tag
        else:
            return self.tag == other

    # Support set operations
    def __hash__(self):
        return hash(self.tag)

    # Check whether the element should be excluded
    def is_excluded(self):
        return "data-search-exclude" in self.attrs

# -----------------------------------------------------------------------------

# HTML section
class Section:
    """
    A block of text with markup, preceded by a title (with markup), i.e., a
    headline with a certain level (h1-h6). Internally used by the parser.
    """

    # Initialize HTML section
    def __init__(self, el, depth = 0):
        self.el = el
        self.depth = depth

        # Initialize section data
        self.text  = []
        self.title = []
        self.id = None

    # String representation
    def __repr__(self):
        if self.id:
            return "#".join([self.el.tag, self.id])
        else:
            return self.el.tag

    # Check whether the section should be excluded
    def is_excluded(self):
        return self.el.is_excluded()

# -----------------------------------------------------------------------------

# HTML parser
class Parser(HTMLParser):
    """
    This parser divides the given string of HTML into a list of sections, each
    of which are preceded by a h1-h6 level heading. A white- and blacklist of
    tags dictates which tags should be preserved as part of the index, and
    which should be ignored in their entirety.
    """

    # Initialize HTML parser
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Tags to skip
        self.skip = set([
            "object",                  # Objects
            "script",                  # Scripts
            "style"                    # Styles
        ])

        # Tags to keep
        self.keep = set([
            "p",                       # Paragraphs
            "code", "pre",             # Code blocks
            "li", "ol", "ul",          # Lists
            "sub", "sup"               # Sub- and superscripts
        ])

        # Current context and section
        self.context = []
        self.section = None

        # All parsed sections
        self.data = []

    # Called at the start of every HTML tag
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # Ignore self-closing tags
        el = Element(tag, attrs)
        if not tag in void:
            self.context.append(el)
        else:
            return

        # Handle heading
        if tag in ([f"h{x}" for x in range(1, 7)]):
            depth = len(self.context)
            if "id" in attrs:

                # Ensure top-level section
                if tag != "h1" and not self.data:
                    self.section = Section(Element("hx"), depth)
                    self.data.append(self.section)

                # Set identifier, if not first section
                self.section = Section(el, depth)
                if self.data:
                    self.section.id = attrs["id"]

                # Append section to list
                self.data.append(self.section)

        # Handle preface - ensure top-level section
        if not self.section:
            self.section = Section(Element("hx"))
            self.data.append(self.section)

        # Handle special cases to skip
        for key, value in attrs.items():

            # Skip block if explicitly excluded from search
            if key == "data-search-exclude":
                self.skip.add(el)
                return

            # Skip line numbers - see https://bit.ly/3GvubZx
            if key == "class" and value == "linenodiv":
                self.skip.add(el)
                return

        # Render opening tag if kept
        if not self.skip.intersection(self.context):
            if tag in self.keep:

                # Check whether we're inside the section title
                data = self.section.text
                if self.section.el in self.context:
                    data = self.section.title

                # Append to section title or text
                data.append(f"<{tag}>")

    # Called at the end of every HTML tag
    def handle_endtag(self, tag):
        if not self.context or self.context[-1] != tag:
            return

        # Check whether we're exiting the current context, which happens when
        # a headline is nested in another element. In that case, we close the
        # current section, continuing to append data to the previous section,
        # which could also be a nested section – see https://bit.ly/3IxxIJZ
        if self.section.depth > len(self.context):
            for section in reversed(self.data):
                if section.depth <= len(self.context):

                    # Set depth to infinity in order to denote that the current
                    # section is exited and must never be considered again.
                    self.section.depth = float("inf")
                    self.section = section
                    break

        # Remove element from skip list
        el = self.context.pop()
        if el in self.skip:
            if el.tag not in ["script", "style", "object"]:
                self.skip.remove(el)
            return

        # Render closing tag if kept
        if not self.skip.intersection(self.context):
            if tag in self.keep:

                # Check whether we're inside the section title
                data = self.section.text
                if self.section.el in self.context:
                    data = self.section.title

                # Search for corresponding opening tag
                index = data.index(f"<{tag}>")
                for i in range(index + 1, len(data)):
                    if not data[i].isspace():
                        index = len(data)
                        break

                # Remove element if empty (or only whitespace)
                if len(data) > index:
                    while len(data) > index:
                        data.pop()

                # Append to section title or text
                else:
                    data.append(f"</{tag}>")

    # Called for the text contents of each tag
    def handle_data(self, data):
        if self.skip.intersection(self.context):
            return

        # Collapse whitespace in non-pre contexts
        if not "pre" in self.context:
            if not data.isspace():
                data = data.replace("\n", " ")
            else:
                data = " "

        # Handle preface - ensure top-level section
        if not self.section:
            self.section = Section(Element("hx"))
            self.data.append(self.section)

        # Handle section headline
        if self.section.el in self.context:
            permalink = False
            for el in self.context:
                if el.tag == "a" and el.attrs.get("class") == "headerlink":
                    permalink = True

            # Ignore permalinks
            if not permalink:
                self.section.title.append(
                    escape(data, quote = False)
                )

        # Collapse adjacent whitespace
        elif data.isspace():
            if not self.section.text or not self.section.text[-1].isspace():
                self.section.text.append(data)

        # Handle everything else
        else:
            self.section.text.append(
                escape(data, quote = False)
            )

# -----------------------------------------------------------------------------
# Data
# -----------------------------------------------------------------------------

# Set up logging
log = logging.getLogger("mkdocs.material.search")

# Tags that are self-closing
void = set([
    "area",                            # Image map areas
    "base",                            # Document base
    "br",                              # Line breaks
    "col",                             # Table columns
    "embed",                           # External content
    "hr",                              # Horizontal rules
    "img",                             # Images
    "input",                           # Input fields
    "link",                            # Links
    "meta",                            # Metadata
    "param",                           # External parameters
    "source",                          # Image source sets
    "track",                           # Text track
    "wbr"                              # Line break opportunities
])