Fixed highlighting of tags

2024-06-14 11:52:32 +03:00 · 2022-12-11 15:55:12 +01:00
parent ee1496499a
commit 24a3be8f04
15 changed files with 252 additions and 190 deletions
--- a/material/assets/javascripts/bundle.ce0331ff.min.js
+++ b/material/assets/javascripts/bundle.ce0331ff.min.js
--- a/material/assets/javascripts/bundle.ce0331ff.min.js.map
+++ b/material/assets/javascripts/bundle.ce0331ff.min.js.map
--- a/material/assets/javascripts/workers/search.f5389c75.min.js
+++ b/material/assets/javascripts/workers/search.f5389c75.min.js
--- a/material/assets/javascripts/workers/search.f5389c75.min.js.map
+++ b/material/assets/javascripts/workers/search.f5389c75.min.js.map
--- a/material/assets/stylesheets/extra.b3906f4e.min.css
+++ b/material/assets/stylesheets/extra.b3906f4e.min.css
--- a/material/assets/stylesheets/extra.b3906f4e.min.css.map
+++ b/material/assets/stylesheets/extra.b3906f4e.min.css.map
--- a/material/assets/stylesheets/extra.d35223bf.min.css
+++ b/material/assets/stylesheets/extra.d35223bf.min.css
--- a/material/assets/stylesheets/extra.d35223bf.min.css.map
+++ b/material/assets/stylesheets/extra.d35223bf.min.css.map
--- a/material/base.html
+++ b/material/base.html
@@ -211,7 +211,7 @@
        "base": base_url,
        "features": features,
        "translations": {},
-        "search": "assets/javascripts/workers/search.208e55ea.min.js" | url
+        "search": "assets/javascripts/workers/search.f5389c75.min.js" | url
      } -%}
      {%- if config.extra.version -%}
        {%- set _ = app.update({ "version": config.extra.version }) -%}
@@ -239,13 +239,13 @@
      </script>
    {% endblock %}
    {% block scripts %}
-      <script src="{{ 'assets/javascripts/bundle.f1ef77e2.min.js' | url }}"></script>
+      <script src="{{ 'assets/javascripts/bundle.ce0331ff.min.js' | url }}"></script>
      {% for path in config.extra_javascript %}
        <script src="{{ path | url }}"></script>
      {% endfor %}
    {% endblock %}
    {% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
-      <link rel="stylesheet" href="{{ 'assets/stylesheets/extra.b3906f4e.min.css' | url }}">
+      <link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d35223bf.min.css' | url }}">
      <script src="{{ 'assets/javascripts/extra/bundle.f719a234.min.js' | url }}" defer></script>
    {% endif %}
  </body>
--- a/src/assets/javascripts/integrations/search/_/index.ts
+++ b/src/assets/javascripts/integrations/search/_/index.ts
@@ -30,6 +30,7 @@ import {
  Position,
  PositionTable,
  highlight,
  highlightAll,
  tokenize
 } from "../internal"
 import {
@@ -46,7 +47,9 @@ import {
 /**
 * Search item
 */
-export interface SearchItem extends SearchDocument {
+export interface SearchItem
  extends SearchDocument
 {
  score: number                        /* Score (relevance) */
  terms: SearchQueryTerms              /* Search query terms */
 }
@@ -213,6 +216,8 @@ export class Search {
      .reduce<SearchItem[]>((item, { ref, score, matchData }) => {
        let doc = this.map.get(ref)
        if (typeof doc !== "undefined") {
          /* Shallow copy document */
          doc = { ...doc }
          if (doc.tags)
            doc.tags = [...doc.tags]
@@ -223,39 +228,29 @@ export class Search {
            Object.keys(matchData.metadata)
          )
-          // we must collect all positions for each term!
+          /* Highlight matches in fields */
          // we now take the keys of the index
          for (const field of this.index.fields) {
-            if (!(field in doc))
+            if (typeof doc[field] === "undefined")
              continue
-            /* Collect matches */
+            /* Collect positions from matches */
            const positions: Position[] = []
            for (const match of Object.values(matchData.metadata))
-              if (field in match)
+              if (typeof match[field] !== "undefined")
                positions.push(...match[field].position)
-            /* Skip field, if no highlighting is necessary */
+            /* Skip highlighting, if no positions were collected */
            if (!positions.length)
              continue
-            // @ts-expect-error - @todo fix typings
+            /* Load table and determine highlighting method */
-            if (Array.isArray(doc[field])) {
+            const table = this.table.get([doc.location, field].join(":"))!
-              // @ts-expect-error - @todo fix typings
+            const fn = Array.isArray(doc[field])
-              for (let i = 0; i < doc[field].length; i++) {
+              ? highlightAll
-                // @ts-expect-error - @todo fix typings
+              : highlight
-                doc[field][i] = highlight(doc[field][i],
+
-                  this.table.get([doc.location, field].join(":"))!,
+            // @ts-expect-error - stop moaning, TypeScript!
-                  positions
+            doc[field] = fn(doc[field], table, positions)
                )
              }
            } else {
              // @ts-expect-error - @todo fix typings
              doc[field] = highlight(doc[field],
                this.table.get([doc.location, field].join(":"))!,
                positions
              )
            }
          }
          /* Highlight title and text and apply post-query boosts */
--- a/src/assets/javascripts/integrations/search/internal/_/index.ts
+++ b/src/assets/javascripts/integrations/search/internal/_/index.ts
@@ -41,15 +41,12 @@ type VisitorFn = (
 /**
 * Split a string using the given separator
 *
- * This function intentionally expects a visitor function argument, as opposed
+ * @param input - Input value
 * to collecting and returning all sections, for better memory efficiency.
 *
 * @param value - String value
 * @param separator - Separator
 * @param fn - Visitor function
 */
 export function split(
-  value: string, separator: RegExp, fn: VisitorFn
+  input: string, separator: RegExp, fn: VisitorFn
 ): void {
  separator = new RegExp(separator, "g")
@@ -57,10 +54,10 @@ export function split(
  let match: RegExpExecArray | null
  let index = 0
  do {
-    match = separator.exec(value)
+    match = separator.exec(input)
    /* Emit non-empty range */
-    const until = match?.index ?? value.length
+    const until = match?.index ?? input.length
    if (index < until)
      fn(index, until)
--- a/src/assets/javascripts/integrations/search/internal/extract/index.ts
+++ b/src/assets/javascripts/integrations/search/internal/extract/index.ts
@@ -20,6 +20,24 @@
 * IN THE SOFTWARE.
 */
 /* ----------------------------------------------------------------------------
 * Types
 * ------------------------------------------------------------------------- */
 /**
 * Extraction type
 *
 * This type defines the possible values that are encoded into the first two
 * bits of a section that is part of the blocks of a tokenization table. There
 * are three types of interest: HTML opening and closing tags, as well as the
 * actual text content we need to extract for indexing.
 */
 export const enum Extract {
  TAG_OPEN  = 0,                       /* HTML opening tag */
  TEXT      = 1,                       /* Text content */
  TAG_CLOSE = 2                        /* HTML closing tag */
 }
 /* ----------------------------------------------------------------------------
 * Helper types
 * ------------------------------------------------------------------------- */
@@ -28,12 +46,12 @@
 * Visitor function
 *
 * @param block - Block index
- * @param operation - Operation index
+ * @param type - Extraction type
 * @param start - Start offset
 * @param end - End offset
 */
 type VisitorFn = (
-  block: number, operation: number, start: number, end: number
+  block: number, type: Extract, start: number, end: number
 ) => void
 /* ----------------------------------------------------------------------------
@@ -41,18 +59,18 @@ type VisitorFn = (
 * ------------------------------------------------------------------------- */
 /**
- * Extract all non-HTML parts of a string
+ * Split a string into markup and text sections
 *
- * This function preprocesses the given string by isolating all non-HTML parts,
+ * This function scans a string and divides it up into sections of markup and
- * in order to ensure that HTML tags are removed before indexing. Note that it
+ * text. For each section, it invokes the given visitor function with the block
- * intentionally expects a visitor function argument, as opposed to collecting
+ * index, extraction type, as well as start and end offsets. Using a visitor
- * and returning all sections, for better memory efficiency.
+ * function (= streaming data) is ideal for minimizing pressure on the GC.
 *
- * @param value - String value
+ * @param input - Input value
 * @param fn - Visitor function
 */
 export function extract(
-  value: string, fn: VisitorFn
+  input: string, fn: VisitorFn
 ): void {
  let block = 0                        /* Current block */
@@ -60,22 +78,22 @@ export function extract(
  let end = 0                          /* Current end offset */
  /* Split string into sections */
-  for (let stack = 0; end < value.length; end++) {
+  for (let stack = 0; end < input.length; end++) {
-    /* Tag start after non-empty section */
+    /* Opening tag after non-empty section */
-    if (value.charAt(end) === "<" && end > start) {
+    if (input.charAt(end) === "<" && end > start) {
-      fn(block, 1, start, start = end)
+      fn(block, Extract.TEXT, start, start = end)
-      /* Tag end */
+    /* Closing tag */
-    } else if (value.charAt(end) === ">") {
+    } else if (input.charAt(end) === ">") {
-      if (value.charAt(start + 1) === "/") {
+      if (input.charAt(start + 1) === "/") {
        if (--stack === 0)
-          fn(block++, 2, start, end + 1)
+          fn(block++, Extract.TAG_CLOSE, start, end + 1)
-        /* Tag is not self-closing */
+      /* Tag is not self-closing */
-      } else if (value.charAt(end - 1) !== "/") {
+      } else if (input.charAt(end - 1) !== "/") {
        if (stack++ === 0)
-          fn(block, 0, start, end + 1)
+          fn(block, Extract.TAG_OPEN, start, end + 1)
      }
      /* New section */
@@ -85,5 +103,5 @@ export function extract(
  /* Add trailing section */
  if (end > start)
-    fn(block, 1, start, end)
+    fn(block, Extract.TEXT, start, end)
 }
--- a/src/assets/javascripts/integrations/search/internal/highlight/index.ts
+++ b/src/assets/javascripts/integrations/search/internal/highlight/index.ts
@@ -25,7 +25,7 @@
 * ------------------------------------------------------------------------- */
 /**
- * Table for indexing
+ * Position table
 */
 export type PositionTable = number[][]
@@ -46,62 +46,103 @@ export type Position = number
 * when executing the query. It then highlights all occurrences, and returns
 * their concatenation. If there are more than two blocks, only two are used.
 *
- * @param value - String value
+ * @param input - Input value
 * @param table - Table for indexing
 * @param positions - Occurrences
 *
 * @returns Highlighted string value
 */
 export function highlight(
-  value: string, table: PositionTable, positions: Position[]
+  input: string, table: PositionTable, positions: Position[]
 ): string {
  return highlightAll([input], table, positions).pop()!
 }
-  /* Map occurrences to blocks */
+/**
-  const blocks = new Map<number, number[]>()
+ * Highlight all occurrences in a set of strings
-  for (const i of positions.sort((a, b) => a - b)) {
+ *
-    const block = i >>> 20
+ * @param inputs - Input values
-    const index = i & 0xFFFFF
+ * @param table - Table for indexing
 * @param positions - Occurrences
 *
 * @returns Highlighted string values
 */
 export function highlightAll(
  inputs: string[], table: PositionTable, positions: Position[]
 ): string[] {
-    /* Ensure presence of block group */
+  /* Map blocks to input values */
-    let group = blocks.get(block)
+  const mapping = [0]
-    if (typeof group === "undefined")
+  for (let t = 1; t < table.length; t++) {
-      blocks.set(block, group = [])
+    const prev = table[t - 1]
    const next = table[t]
-    /* Add index to group */
+    /* Check if table points to new block */
-    group.push(index)
+    const p = prev[prev.length - 1] >>> 2 & 0x3FF
    const q = next[0]               >>> 12
    /* Add block to mapping */
    mapping.push(+(p > q) + mapping[mapping.length - 1])
  }
-  /* Compute slices */
+  /* Highlight strings one after another */
-  const slices: string[] = []
+  return inputs.map((input, i) => {
  for (const [block, indexes] of blocks) {
    const t = table[block]
-    /* Extract positions and length */
+    /* Map occurrences to blocks */
-    const start  = t[0]            >>> 12
+    const blocks = new Map<number, number[]>()
-    const end    = t[t.length - 1] >>> 12
+    for (const p of positions.sort((a, b) => a - b)) {
-    const length = t[t.length - 1] >>> 2 & 0x3FF
+      const index = p & 0xFFFFF
      const block = p >>> 20
      if (mapping[block] !== i)
        continue
-    /* Extract and highlight slice/block */
+      /* Ensure presence of block group */
-    let slice = value.slice(start, end + length)
+      let group = blocks.get(block)
-    for (const i of indexes.sort((a, b) => b - a)) {
+      if (typeof group === "undefined")
        blocks.set(block, group = [])
-      /* Retrieve offset and length of match */
+      /* Add index to group */
-      const p = (t[i] >>> 12) - start
+      group.push(index)
      const q = (t[i] >>> 2 & 0x3FF) + p
      /* Wrap occurrence */
      slice = [
        slice.slice(0, p),
        "<mark>", slice.slice(p, q), "</mark>",
        slice.slice(q)
      ].join("")
    }
-    /* Append slice and abort if we have two */
+    /* Just return string, if no occurrences */
-    if (slices.push(slice) === 2)
+    if (blocks.size === 0)
-      break
+      return input
  }
-  /* Return highlighted string value */
+    /* Compute slices */
-  return slices.join("")
+    const slices: string[] = []
    for (const [block, indexes] of blocks) {
      const t = table[block]
      /* Extract positions and length */
      const start  = t[0]            >>> 12
      const end    = t[t.length - 1] >>> 12
      const length = t[t.length - 1] >>> 2 & 0x3FF
      /* Extract and highlight slice */
      let slice = input.slice(start, end + length)
      for (const j of indexes.sort((a, b) => b - a)) {
        /* Retrieve offset and length of match */
        const p = (t[j] >>> 12) - start
        const q = (t[j] >>> 2 & 0x3FF) + p
        /* Wrap occurrence */
        slice = [
          slice.slice(0, p),
          "<mark>",
          slice.slice(p, q),
          "</mark>",
          slice.slice(q)
        ].join("")
      }
      /* Append slice and abort if we have two */
      if (slices.push(slice) === 2)
        break
    }
    /* Return highlighted slices */
    return slices.join("")
  })
 }
--- a/src/assets/javascripts/integrations/search/internal/tokenize/index.ts
+++ b/src/assets/javascripts/integrations/search/internal/tokenize/index.ts
@@ -21,19 +21,29 @@
 */
 import { split } from "../_"
-import { extract } from "../extract"
+import {
  Extract,
  extract
 } from "../extract"
 /* ----------------------------------------------------------------------------
 * Functions
 * ------------------------------------------------------------------------- */
 /**
- * Split a string into tokens
+ * Split a string or set of strings into tokens
 *
 * This tokenizer supersedes the default tokenizer that is provided by Lunr.js,
 * as it is aware of HTML tags and allows for multi-character splitting.
 *
- * @param input - String value or token
+ * It takes the given inputs, splits each of them into markup and text sections,
 * tokenizes and segments (if necessary) each of them, and then indexes them in
 * a table by using a compact bit representation. Bitwise techniques are used
 * to write and read from the table during indexing and querying.
 *
 * @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller
 *
 * @param input - Input value(s)
 *
 * @returns Tokens
 */
@@ -41,90 +51,89 @@ export function tokenize(
  input?: string | string[]
 ): lunr.Token[] {
  const tokens: lunr.Token[] = []
  if (typeof input === "undefined")
    return tokens
-  /**
+  /* Initialize segmenter, if loaded */
   * Initialize segmenter, if loaded
   *
   * Note that doing this here is not ideal, but it's okay as we just test it
   * before bringing the new search implementation in its final shape.
   */
  const segmenter = "TinySegmenter" in lunr
    ? new lunr.TinySegmenter()
    : undefined
-  /* Tokenize an array of string values */
+  /* Tokenize strings one after another */
-  if (Array.isArray(input)) {
+  const inputs = Array.isArray(input) ? input : [input]
-    // @todo: handle multi-valued fields (e.g. tags)
+  for (let i = 0; i < inputs.length; i++) {
    for (const value of input)
      tokens.push(...tokenize(value))
  /* Tokenize a string value */
  } else if (input) {
    const table = lunr.tokenizer.table
    const total = table.length
    /* Split string into sections and tokenize content blocks */
-    extract(input, (block, type, start, end) => {
+    extract(inputs[i], (block, type, start, end) => {
-      if (type & 1) {
+      block += total
-        const section = input.slice(start, end)
+      switch (type) {
        split(section, lunr.tokenizer.separator, (index, until) => {
-          /**
+        /* Handle markup */
-           * Apply segmenter after tokenization. Note that the segmenter will
+        case Extract.TAG_OPEN:
-           * also split words at word boundaries, which is not what we want, so
+        case Extract.TAG_CLOSE:
           * we need to check if we can somehow mitigate this behavior.
           */
          if (typeof segmenter !== "undefined") {
            const subsection = section.slice(index, until)
            if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
              const segments = segmenter.segment(subsection)
              for (let i = 0, l = 0; i < segments.length; i++) {
                /* Add block to table */
                table[block] ||= []
                table[block].push(
                  start + index + l << 12 |
                  segments[i].length << 2 |
                  type
                )
                /* Add block as token */
                tokens.push(new lunr.Token(
                  segments[i].toLowerCase(), {
                    position: block << 20 | table[block].length - 1
                  }
                ))
                /* Keep track of length */
                l += segments[i].length
              }
              return // combine segmenter with other approach!?
            }
          }
          /* Add block to table */
          table[block] ||= []
          table[block].push(
-            start + index << 12 |
+            start       << 12 |
-            until - index <<  2 |
+            end - start <<  2 |
            type
          )
          break
-          /* Add block as token */
+        /* Handle text content */
-          tokens.push(new lunr.Token(
+        case Extract.TEXT:
-            section.slice(index, until).toLowerCase(), {
+          const section = inputs[i].slice(start, end)
-              position: block << 20 | table[block].length - 1
+          split(section, lunr.tokenizer.separator, (index, until) => {
            /**
             * Apply segmenter after tokenization. Note that the segmenter will
             * also split words at word boundaries, which is not what we want,
             * so we need to check if we can somehow mitigate this behavior.
             */
            if (typeof segmenter !== "undefined") {
              const subsection = section.slice(index, until)
              if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
                const segments = segmenter.segment(subsection)
                for (let s = 0, l = 0; s < segments.length; s++) {
                  /* Add block to section */
                  table[block] ||= []
                  table[block].push(
                    start + index + l  << 12 |
                    segments[s].length <<  2 |
                    type
                  )
                  /* Add token with position */
                  tokens.push(new lunr.Token(
                    segments[s].toLowerCase(), {
                      position: block << 20 | table[block].length - 1
                    }
                  ))
                  /* Keep track of length */
                  l += segments[s].length
                }
                return
              }
            }
          ))
        })
-      /* Add non-content block to table */
+            /* Add block to section */
-      } else {
+            table[block] ||= []
-        table[block] ||= []
+            table[block].push(
-        table[block].push(
+              start + index << 12 |
-          start       << 12 |
+              until - index <<  2 |
-          end - start <<  2 |
+              type
-          type
+            )
-        )
+
            /* Add token with position */
            tokens.push(new lunr.Token(
              section.slice(index, until).toLowerCase(), {
                position: block << 20 | table[block].length - 1
              }
            ))
          })
      }
    })
  }
--- a/typings/lunr/index.d.ts
+++ b/typings/lunr/index.d.ts
@@ -26,15 +26,17 @@ import lunr from "lunr"
 * Global types
 * ------------------------------------------------------------------------- */
 type Fields = "text" | "title" | "tags"
 declare global {
  namespace lunr {
    /**
     * Index - expose inverted index
     */
-    interface Index {
+    interface Index { // this is defined in the actual interface...
      invertedIndex: Record<string, unknown>
-      fields: string[] // @todo: make typing generic?
+      fields: Fields[]
    }
    interface Builder {