Fixed highlighting of tags

squidfunk 2022-12-11 15:55:12 +01:00
parent ee1496499a
commit 24a3be8f04
15 changed files with 252 additions and 190 deletions

File diff suppressed because one or more lines are too long (4 files)

View File

@@ -211,7 +211,7 @@
"base": base_url,
"features": features,
"translations": {},
"search": "assets/javascripts/workers/search.208e55ea.min.js" | url
"search": "assets/javascripts/workers/search.f5389c75.min.js" | url
} -%}
{%- if config.extra.version -%}
{%- set _ = app.update({ "version": config.extra.version }) -%}
@@ -239,13 +239,13 @@
</script>
{% endblock %}
{% block scripts %}
<script src="{{ 'assets/javascripts/bundle.f1ef77e2.min.js' | url }}"></script>
<script src="{{ 'assets/javascripts/bundle.ce0331ff.min.js' | url }}"></script>
{% for path in config.extra_javascript %}
<script src="{{ path | url }}"></script>
{% endfor %}
{% endblock %}
{% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.b3906f4e.min.css' | url }}">
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d35223bf.min.css' | url }}">
<script src="{{ 'assets/javascripts/extra/bundle.f719a234.min.js' | url }}" defer></script>
{% endif %}
</body>

View File

@@ -30,6 +30,7 @@ import {
Position,
PositionTable,
highlight,
highlightAll,
tokenize
} from "../internal"
import {
@@ -46,7 +47,9 @@ import {
/**
* Search item
*/
export interface SearchItem extends SearchDocument {
export interface SearchItem
extends SearchDocument
{
score: number /* Score (relevance) */
terms: SearchQueryTerms /* Search query terms */
}
@@ -213,6 +216,8 @@ export class Search {
.reduce<SearchItem[]>((item, { ref, score, matchData }) => {
let doc = this.map.get(ref)
if (typeof doc !== "undefined") {
/* Shallow copy document */
doc = { ...doc }
if (doc.tags)
doc.tags = [...doc.tags]
@@ -223,39 +228,29 @@
Object.keys(matchData.metadata)
)
// we must collect all positions for each term!
// we now take the keys of the index
/* Highlight matches in fields */
for (const field of this.index.fields) {
if (!(field in doc))
if (typeof doc[field] === "undefined")
continue
/* Collect matches */
/* Collect positions from matches */
const positions: Position[] = []
for (const match of Object.values(matchData.metadata))
if (field in match)
if (typeof match[field] !== "undefined")
positions.push(...match[field].position)
/* Skip field, if no highlighting is necessary */
/* Skip highlighting, if no positions were collected */
if (!positions.length)
continue
// @ts-expect-error - @todo fix typings
if (Array.isArray(doc[field])) {
// @ts-expect-error - @todo fix typings
for (let i = 0; i < doc[field].length; i++) {
// @ts-expect-error - @todo fix typings
doc[field][i] = highlight(doc[field][i],
this.table.get([doc.location, field].join(":"))!,
positions
)
}
} else {
// @ts-expect-error - @todo fix typings
doc[field] = highlight(doc[field],
this.table.get([doc.location, field].join(":"))!,
positions
)
}
/* Load table and determine highlighting method */
const table = this.table.get([doc.location, field].join(":"))!
const fn = Array.isArray(doc[field])
? highlightAll
: highlight
// @ts-expect-error - stop moaning, TypeScript!
doc[field] = fn(doc[field], table, positions)
}
/* Highlight title and text and apply post-query boosts */
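The hunk above collapses the duplicated highlighting branches into a single dispatch: look up the position table once, then pick `highlightAll` for array-valued fields such as tags and `highlight` for scalar ones. A minimal sketch of that dispatch as a standalone helper (hypothetical, not part of the commit), reusing the `../internal` exports from the import block above:

```ts
import {
  Position,
  PositionTable,
  highlight,
  highlightAll
} from "../internal"

/* Hypothetical helper mirroring the dispatch in the hunk above: array
   fields (e.g. tags) go through highlightAll, scalar fields through
   highlight, both sharing the same table and collected positions */
function highlightField(
  value: string | string[], table: PositionTable, positions: Position[]
): string | string[] {
  return Array.isArray(value)
    ? highlightAll(value, table, positions)
    : highlight(value, table, positions)
}
```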

View File

@@ -41,15 +41,12 @@ type VisitorFn = (
/**
* Split a string using the given separator
*
* This function intentionally expects a visitor function argument, as opposed
* to collecting and returning all sections, for better memory efficiency.
*
* @param value - String value
* @param input - Input value
* @param separator - Separator
* @param fn - Visitor function
*/
export function split(
value: string, separator: RegExp, fn: VisitorFn
input: string, separator: RegExp, fn: VisitorFn
): void {
separator = new RegExp(separator, "g")
@@ -57,10 +54,10 @@ export function split(
let match: RegExpExecArray | null
let index = 0
do {
match = separator.exec(value)
match = separator.exec(input)
/* Emit non-empty range */
const until = match?.index ?? value.length
const until = match?.index ?? input.length
if (index < until)
fn(index, until)
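Because `split` streams ranges to a visitor instead of materializing substrings, callers collect only what they need. A hypothetical usage sketch (assuming the rest of the loop, cut off by the hunk boundary, advances `index` past each separator match):

```ts
/* Collect the ranges the visitor receives for a whitespace separator;
   offsets always reference the original input string */
const ranges: [number, number][] = []
split("search is fast", /\s+/, (index, until) => {
  ranges.push([index, until])
})

/* ranges === [[0, 6], [7, 9], [10, 14]], i.e. "search", "is", "fast" */
```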

View File

@@ -20,6 +20,24 @@
* IN THE SOFTWARE.
*/
/* ----------------------------------------------------------------------------
* Types
* ------------------------------------------------------------------------- */
/**
* Extraction type
*
* This type defines the possible values that are encoded into the first two
* bits of a section that is part of the blocks of a tokenization table. There
* are three types of interest: HTML opening and closing tags, as well as the
* actual text content we need to extract for indexing.
*/
export const enum Extract {
TAG_OPEN = 0, /* HTML opening tag */
TEXT = 1, /* Text content */
TAG_CLOSE = 2 /* HTML closing tag */
}
/* ----------------------------------------------------------------------------
* Helper types
* ------------------------------------------------------------------------- */
@@ -28,12 +46,12 @@
* Visitor function
*
* @param block - Block index
* @param operation - Operation index
* @param type - Extraction type
* @param start - Start offset
* @param end - End offset
*/
type VisitorFn = (
block: number, operation: number, start: number, end: number
block: number, type: Extract, start: number, end: number
) => void
/* ----------------------------------------------------------------------------
@@ -41,18 +59,18 @@ type VisitorFn = (
* ------------------------------------------------------------------------- */
/**
* Extract all non-HTML parts of a string
* Split a string into markup and text sections
*
* This function preprocesses the given string by isolating all non-HTML parts,
* in order to ensure that HTML tags are removed before indexing. Note that it
* intentionally expects a visitor function argument, as opposed to collecting
* and returning all sections, for better memory efficiency.
* This function scans a string and divides it up into sections of markup and
* text. For each section, it invokes the given visitor function with the block
* index, extraction type, as well as start and end offsets. Using a visitor
* function (= streaming data) is ideal for minimizing pressure on the GC.
*
* @param value - String value
* @param input - Input value
* @param fn - Visitor function
*/
export function extract(
value: string, fn: VisitorFn
input: string, fn: VisitorFn
): void {
let block = 0 /* Current block */
@@ -60,22 +78,22 @@ export function extract(
let end = 0 /* Current end offset */
/* Split string into sections */
for (let stack = 0; end < value.length; end++) {
for (let stack = 0; end < input.length; end++) {
/* Tag start after non-empty section */
if (value.charAt(end) === "<" && end > start) {
fn(block, 1, start, start = end)
/* Opening tag after non-empty section */
if (input.charAt(end) === "<" && end > start) {
fn(block, Extract.TEXT, start, start = end)
/* Tag end */
} else if (value.charAt(end) === ">") {
if (value.charAt(start + 1) === "/") {
/* Closing tag */
} else if (input.charAt(end) === ">") {
if (input.charAt(start + 1) === "/") {
if (--stack === 0)
fn(block++, 2, start, end + 1)
fn(block++, Extract.TAG_CLOSE, start, end + 1)
/* Tag is not self-closing */
} else if (value.charAt(end - 1) !== "/") {
/* Tag is not self-closing */
} else if (input.charAt(end - 1) !== "/") {
if (stack++ === 0)
fn(block, 0, start, end + 1)
fn(block, Extract.TAG_OPEN, start, end + 1)
}
/* New section */
@@ -85,5 +103,5 @@ export function extract(
/* Add trailing section */
if (end > start)
fn(block, 1, start, end)
fn(block, Extract.TEXT, start, end)
}
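A hypothetical trace of the sections `extract` emits for a small fragment (assuming the `/* New section */` step elided by the hunk boundary resets `start` after each `>`, as the surrounding logic implies). The stack guards (`stack++ === 0`, `--stack === 0`) mean only the outermost tags are reported, so everything inside one top-level element shares a single block index:

```ts
const input = "<p>Hello <em>world</em></p>"
extract(input, (block, type, start, end) => {
  console.log(block, type, input.slice(start, end))
})

/* Emits, as (block, type, slice) tuples:
   0  0  <p>       - Extract.TAG_OPEN
   0  1  Hello     - Extract.TEXT (includes the trailing space)
   0  1  world     - Extract.TEXT (the nested <em> tags are skipped)
   0  2  </p>      - Extract.TAG_CLOSE */
```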

View File

@@ -25,7 +25,7 @@
* ------------------------------------------------------------------------- */
/**
* Table for indexing
* Position table
*/
export type PositionTable = number[][]
@@ -46,62 +46,103 @@ export type Position = number
* when executing the query. It then highlights all occurrences, and returns
* their concatenation. In case of multiple blocks, two are returned.
*
* @param value - String value
* @param input - Input value
* @param table - Table for indexing
* @param positions - Occurrences
*
* @returns Highlighted string value
*/
export function highlight(
value: string, table: PositionTable, positions: Position[]
input: string, table: PositionTable, positions: Position[]
): string {
return highlightAll([input], table, positions).pop()!
}
/* Map occurrences to blocks */
const blocks = new Map<number, number[]>()
for (const i of positions.sort((a, b) => a - b)) {
const block = i >>> 20
const index = i & 0xFFFFF
/**
* Highlight all occurrences in a set of strings
*
* @param inputs - Input values
* @param table - Table for indexing
* @param positions - Occurrences
*
* @returns Highlighted string values
*/
export function highlightAll(
inputs: string[], table: PositionTable, positions: Position[]
): string[] {
/* Ensure presence of block group */
let group = blocks.get(block)
if (typeof group === "undefined")
blocks.set(block, group = [])
/* Map blocks to input values */
const mapping = [0]
for (let t = 1; t < table.length; t++) {
const prev = table[t - 1]
const next = table[t]
/* Add index to group */
group.push(index)
/* Check if table points to new block */
const p = prev[prev.length - 1] >>> 2 & 0x3FF
const q = next[0] >>> 12
/* Add block to mapping */
mapping.push(+(p > q) + mapping[mapping.length - 1])
}
/* Compute slices */
const slices: string[] = []
for (const [block, indexes] of blocks) {
const t = table[block]
/* Highlight strings one after another */
return inputs.map((input, i) => {
/* Extract positions and length */
const start = t[0] >>> 12
const end = t[t.length - 1] >>> 12
const length = t[t.length - 1] >>> 2 & 0x3FF
/* Map occurrences to blocks */
const blocks = new Map<number, number[]>()
for (const p of positions.sort((a, b) => a - b)) {
const index = p & 0xFFFFF
const block = p >>> 20
if (mapping[block] !== i)
continue
/* Extract and highlight slice/block */
let slice = value.slice(start, end + length)
for (const i of indexes.sort((a, b) => b - a)) {
/* Ensure presence of block group */
let group = blocks.get(block)
if (typeof group === "undefined")
blocks.set(block, group = [])
/* Retrieve offset and length of match */
const p = (t[i] >>> 12) - start
const q = (t[i] >>> 2 & 0x3FF) + p
/* Wrap occurrence */
slice = [
slice.slice(0, p),
"<mark>", slice.slice(p, q), "</mark>",
slice.slice(q)
].join("")
/* Add index to group */
group.push(index)
}
/* Append slice and abort if we have two */
if (slices.push(slice) === 2)
break
}
/* Just return string, if no occurrences */
if (blocks.size === 0)
return input
/* Return highlighted string value */
return slices.join("")
/* Compute slices */
const slices: string[] = []
for (const [block, indexes] of blocks) {
const t = table[block]
/* Extract positions and length */
const start = t[0] >>> 12
const end = t[t.length - 1] >>> 12
const length = t[t.length - 1] >>> 2 & 0x3FF
/* Extract and highlight slice */
let slice = input.slice(start, end + length)
for (const j of indexes.sort((a, b) => b - a)) {
/* Retrieve offset and length of match */
const p = (t[j] >>> 12) - start
const q = (t[j] >>> 2 & 0x3FF) + p
/* Wrap occurrence */
slice = [
slice.slice(0, p),
"<mark>",
slice.slice(p, q),
"</mark>",
slice.slice(q)
].join("")
}
/* Append slice and abort if we have two */
if (slices.push(slice) === 2)
break
}
/* Return highlighted slices */
return slices.join("")
})
}
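All shifts and masks above read one packed layout: a table entry keeps a section's start offset in the upper bits, its length in the middle ten, and its extraction type in the lowest two, while a position keeps the block index above a 20-bit entry index. A hypothetical decoding helper using exactly the masks from this hunk:

```ts
/* Decode a position table entry packed as start << 12 | length << 2 | type */
function decodeEntry(entry: number) {
  return {
    start:  entry >>> 12,        /* Start offset in the input */
    length: entry >>> 2 & 0x3FF, /* Section length (10 bits, max 1023) */
    type:   entry & 0b11         /* Extract.TAG_OPEN, TEXT or TAG_CLOSE */
  }
}

/* Decode a position packed as block << 20 | index */
function decodePosition(position: number) {
  return {
    block: position >>> 20,      /* Block index into the position table */
    index: position & 0xFFFFF    /* Entry index within that block */
  }
}
```

A block's slice is then `input.slice(start, end + length)` over its first and last entries, exactly as the hunk computes before wrapping each occurrence in `<mark>` tags.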

View File

@@ -21,19 +21,29 @@
*/
import { split } from "../_"
import { extract } from "../extract"
import {
Extract,
extract
} from "../extract"
/* ----------------------------------------------------------------------------
* Functions
* ------------------------------------------------------------------------- */
/**
* Split a string into tokens
* Split a string or set of strings into tokens
*
* This tokenizer supersedes the default tokenizer that is provided by Lunr.js,
* as it is aware of HTML tags and allows for multi-character splitting.
*
* @param input - String value or token
* It takes the given inputs, splits each of them into markup and text sections,
* tokenizes and segments (if necessary) each of them, and then indexes them in
* a table by using a compact bit representation. Bitwise techniques are used
* to write and read from the table during indexing and querying.
*
* @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller
*
* @param input - Input value(s)
*
* @returns Tokens
*/
@@ -41,90 +51,89 @@ export function tokenize(
input?: string | string[]
): lunr.Token[] {
const tokens: lunr.Token[] = []
if (typeof input === "undefined")
return tokens
/**
* Initialize segmenter, if loaded
*
* Note that doing this here is not ideal, but it's okay as we just test it
* before bringing the new search implementation in its final shape.
*/
/* Initialize segmenter, if loaded */
const segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter()
: undefined
/* Tokenize an array of string values */
if (Array.isArray(input)) {
// @todo: handle multi-valued fields (e.g. tags)
for (const value of input)
tokens.push(...tokenize(value))
/* Tokenize a string value */
} else if (input) {
/* Tokenize strings one after another */
const inputs = Array.isArray(input) ? input : [input]
for (let i = 0; i < inputs.length; i++) {
const table = lunr.tokenizer.table
const total = table.length
/* Split string into sections and tokenize content blocks */
extract(input, (block, type, start, end) => {
if (type & 1) {
const section = input.slice(start, end)
split(section, lunr.tokenizer.separator, (index, until) => {
extract(inputs[i], (block, type, start, end) => {
block += total
switch (type) {
/**
* Apply segmenter after tokenization. Note that the segmenter will
* also split words at word boundaries, which is not what we want, so
* we need to check if we can somehow mitigate this behavior.
*/
if (typeof segmenter !== "undefined") {
const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection)
for (let i = 0, l = 0; i < segments.length; i++) {
/* Add block to table */
table[block] ||= []
table[block].push(
start + index + l << 12 |
segments[i].length << 2 |
type
)
/* Add block as token */
tokens.push(new lunr.Token(
segments[i].toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
/* Keep track of length */
l += segments[i].length
}
return // combine segmenter with other approach!?
}
}
/* Add block to table */
/* Handle markup */
case Extract.TAG_OPEN:
case Extract.TAG_CLOSE:
table[block] ||= []
table[block].push(
start + index << 12 |
until - index << 2 |
start << 12 |
end - start << 2 |
type
)
break
/* Add block as token */
tokens.push(new lunr.Token(
section.slice(index, until).toLowerCase(), {
position: block << 20 | table[block].length - 1
/* Handle text content */
case Extract.TEXT:
const section = inputs[i].slice(start, end)
split(section, lunr.tokenizer.separator, (index, until) => {
/**
* Apply segmenter after tokenization. Note that the segmenter will
* also split words at word boundaries, which is not what we want,
* so we need to check if we can somehow mitigate this behavior.
*/
if (typeof segmenter !== "undefined") {
const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection)
for (let s = 0, l = 0; s < segments.length; s++) {
/* Add block to section */
table[block] ||= []
table[block].push(
start + index + l << 12 |
segments[s].length << 2 |
type
)
/* Add token with position */
tokens.push(new lunr.Token(
segments[s].toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
/* Keep track of length */
l += segments[s].length
}
return
}
}
))
})
/* Add non-content block to table */
} else {
table[block] ||= []
table[block].push(
start << 12 |
end - start << 2 |
type
)
/* Add block to section */
table[block] ||= []
table[block].push(
start + index << 12 |
until - index << 2 |
type
)
/* Add token with position */
tokens.push(new lunr.Token(
section.slice(index, until).toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
})
}
})
}
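The tokenizer is the write side of the layout that `highlight` and `highlightAll` unpack at query time: each table entry packs offset, length and type, and each token's `position` metadata packs block and entry index. A self-contained round-trip with illustrative values:

```ts
/* Write side, as in tokenize above: entry packs offset, length, type */
const entry = 42 << 12 | 5 << 2 | 1  /* start 42, length 5, Extract.TEXT */

/* Token position packs the block index and the entry index in the block */
const position = 3 << 20 | 7         /* block 3, entry 7 */

/* Read side, as in highlightAll: the same masks recover the values */
console.log(entry >>> 12)            /* 42 */
console.log(entry >>> 2 & 0x3FF)     /* 5  */
console.log(position >>> 20)         /* 3  */
console.log(position & 0xFFFFF)      /* 7  */
```

Note also the `block += total` offset at the top of the loop: for multi-valued fields such as tags, each input's blocks are appended after the previous input's, which is what lets `highlightAll` map blocks back to individual tag strings.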

View File

@@ -26,15 +26,17 @@ import lunr from "lunr"
* Global types
* ------------------------------------------------------------------------- */
type Fields = "text" | "title" | "tags"
declare global {
namespace lunr {
/**
* Index - expose inverted index
*/
interface Index {
interface Index { // this is defined in the actual interface...
invertedIndex: Record<string, unknown>
fields: string[] // @todo: make typing generic?
fields: Fields[]
}
interface Builder {