Fixed highlighting of tags

This commit is contained in:
squidfunk
2022-12-11 15:55:12 +01:00
parent ee1496499a
commit 24a3be8f04
15 changed files with 252 additions and 190 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -211,7 +211,7 @@
"base": base_url, "base": base_url,
"features": features, "features": features,
"translations": {}, "translations": {},
"search": "assets/javascripts/workers/search.208e55ea.min.js" | url "search": "assets/javascripts/workers/search.f5389c75.min.js" | url
} -%} } -%}
{%- if config.extra.version -%} {%- if config.extra.version -%}
{%- set _ = app.update({ "version": config.extra.version }) -%} {%- set _ = app.update({ "version": config.extra.version }) -%}
@@ -239,13 +239,13 @@
</script> </script>
{% endblock %} {% endblock %}
{% block scripts %} {% block scripts %}
<script src="{{ 'assets/javascripts/bundle.f1ef77e2.min.js' | url }}"></script> <script src="{{ 'assets/javascripts/bundle.ce0331ff.min.js' | url }}"></script>
{% for path in config.extra_javascript %} {% for path in config.extra_javascript %}
<script src="{{ path | url }}"></script> <script src="{{ path | url }}"></script>
{% endfor %} {% endfor %}
{% endblock %} {% endblock %}
{% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %} {% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.b3906f4e.min.css' | url }}"> <link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d35223bf.min.css' | url }}">
<script src="{{ 'assets/javascripts/extra/bundle.f719a234.min.js' | url }}" defer></script> <script src="{{ 'assets/javascripts/extra/bundle.f719a234.min.js' | url }}" defer></script>
{% endif %} {% endif %}
</body> </body>

View File

@@ -30,6 +30,7 @@ import {
Position, Position,
PositionTable, PositionTable,
highlight, highlight,
highlightAll,
tokenize tokenize
} from "../internal" } from "../internal"
import { import {
@@ -46,7 +47,9 @@ import {
/** /**
* Search item * Search item
*/ */
export interface SearchItem extends SearchDocument { export interface SearchItem
extends SearchDocument
{
score: number /* Score (relevance) */ score: number /* Score (relevance) */
terms: SearchQueryTerms /* Search query terms */ terms: SearchQueryTerms /* Search query terms */
} }
@@ -213,6 +216,8 @@ export class Search {
.reduce<SearchItem[]>((item, { ref, score, matchData }) => { .reduce<SearchItem[]>((item, { ref, score, matchData }) => {
let doc = this.map.get(ref) let doc = this.map.get(ref)
if (typeof doc !== "undefined") { if (typeof doc !== "undefined") {
/* Shallow copy document */
doc = { ...doc } doc = { ...doc }
if (doc.tags) if (doc.tags)
doc.tags = [...doc.tags] doc.tags = [...doc.tags]
@@ -223,39 +228,29 @@ export class Search {
Object.keys(matchData.metadata) Object.keys(matchData.metadata)
) )
// we must collect all positions for each term! /* Highlight matches in fields */
// we now take the keys of the index
for (const field of this.index.fields) { for (const field of this.index.fields) {
if (!(field in doc)) if (typeof doc[field] === "undefined")
continue continue
/* Collect matches */ /* Collect positions from matches */
const positions: Position[] = [] const positions: Position[] = []
for (const match of Object.values(matchData.metadata)) for (const match of Object.values(matchData.metadata))
if (field in match) if (typeof match[field] !== "undefined")
positions.push(...match[field].position) positions.push(...match[field].position)
/* Skip field, if no highlighting is necessary */ /* Skip highlighting, if no positions were collected */
if (!positions.length) if (!positions.length)
continue continue
// @ts-expect-error - @todo fix typings /* Load table and determine highlighting method */
if (Array.isArray(doc[field])) { const table = this.table.get([doc.location, field].join(":"))!
// @ts-expect-error - @todo fix typings const fn = Array.isArray(doc[field])
for (let i = 0; i < doc[field].length; i++) { ? highlightAll
// @ts-expect-error - @todo fix typings : highlight
doc[field][i] = highlight(doc[field][i],
this.table.get([doc.location, field].join(":"))!, // @ts-expect-error - stop moaning, TypeScript!
positions doc[field] = fn(doc[field], table, positions)
)
}
} else {
// @ts-expect-error - @todo fix typings
doc[field] = highlight(doc[field],
this.table.get([doc.location, field].join(":"))!,
positions
)
}
} }
/* Highlight title and text and apply post-query boosts */ /* Highlight title and text and apply post-query boosts */

View File

@@ -41,15 +41,12 @@ type VisitorFn = (
/** /**
* Split a string using the given separator * Split a string using the given separator
* *
* This function intentionally expects a visitor function argument, as opposed * @param input - Input value
* to collecting and returning all sections, for better memory efficiency.
*
* @param value - String value
* @param separator - Separator * @param separator - Separator
* @param fn - Visitor function * @param fn - Visitor function
*/ */
export function split( export function split(
value: string, separator: RegExp, fn: VisitorFn input: string, separator: RegExp, fn: VisitorFn
): void { ): void {
separator = new RegExp(separator, "g") separator = new RegExp(separator, "g")
@@ -57,10 +54,10 @@ export function split(
let match: RegExpExecArray | null let match: RegExpExecArray | null
let index = 0 let index = 0
do { do {
match = separator.exec(value) match = separator.exec(input)
/* Emit non-empty range */ /* Emit non-empty range */
const until = match?.index ?? value.length const until = match?.index ?? input.length
if (index < until) if (index < until)
fn(index, until) fn(index, until)

View File

@@ -20,6 +20,24 @@
* IN THE SOFTWARE. * IN THE SOFTWARE.
*/ */
/* ----------------------------------------------------------------------------
* Types
* ------------------------------------------------------------------------- */
/**
* Extraction type
*
* This type defines the possible values that are encoded into the first two
* bits of a section that is part of the blocks of a tokenization table. There
* are three types of interest: HTML opening and closing tags, as well as the
* actual text content we need to extract for indexing.
*/
export const enum Extract {
TAG_OPEN = 0, /* HTML opening tag */
TEXT = 1, /* Text content */
TAG_CLOSE = 2 /* HTML closing tag */
}
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* Helper types * Helper types
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
@@ -28,12 +46,12 @@
* Visitor function * Visitor function
* *
* @param block - Block index * @param block - Block index
* @param operation - Operation index * @param type - Extraction type
* @param start - Start offset * @param start - Start offset
* @param end - End offset * @param end - End offset
*/ */
type VisitorFn = ( type VisitorFn = (
block: number, operation: number, start: number, end: number block: number, type: Extract, start: number, end: number
) => void ) => void
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
@@ -41,18 +59,18 @@ type VisitorFn = (
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
/** /**
* Extract all non-HTML parts of a string * Split a string into markup and text sections
* *
* This function preprocesses the given string by isolating all non-HTML parts, * This function scans a string and divides it up into sections of markup and
* in order to ensure that HTML tags are removed before indexing. Note that it * text. For each section, it invokes the given visitor function with the block
* intentionally expects a visitor function argument, as opposed to collecting * index, extraction type, as well as start and end offsets. Using a visitor
* and returning all sections, for better memory efficiency. * function (= streaming data) is ideal for minimizing pressure on the GC.
* *
* @param value - String value * @param input - Input value
* @param fn - Visitor function * @param fn - Visitor function
*/ */
export function extract( export function extract(
value: string, fn: VisitorFn input: string, fn: VisitorFn
): void { ): void {
let block = 0 /* Current block */ let block = 0 /* Current block */
@@ -60,22 +78,22 @@ export function extract(
let end = 0 /* Current end offset */ let end = 0 /* Current end offset */
/* Split string into sections */ /* Split string into sections */
for (let stack = 0; end < value.length; end++) { for (let stack = 0; end < input.length; end++) {
/* Tag start after non-empty section */ /* Opening tag after non-empty section */
if (value.charAt(end) === "<" && end > start) { if (input.charAt(end) === "<" && end > start) {
fn(block, 1, start, start = end) fn(block, Extract.TEXT, start, start = end)
/* Tag end */ /* Closing tag */
} else if (value.charAt(end) === ">") { } else if (input.charAt(end) === ">") {
if (value.charAt(start + 1) === "/") { if (input.charAt(start + 1) === "/") {
if (--stack === 0) if (--stack === 0)
fn(block++, 2, start, end + 1) fn(block++, Extract.TAG_CLOSE, start, end + 1)
/* Tag is not self-closing */ /* Tag is not self-closing */
} else if (value.charAt(end - 1) !== "/") { } else if (input.charAt(end - 1) !== "/") {
if (stack++ === 0) if (stack++ === 0)
fn(block, 0, start, end + 1) fn(block, Extract.TAG_OPEN, start, end + 1)
} }
/* New section */ /* New section */
@@ -85,5 +103,5 @@ export function extract(
/* Add trailing section */ /* Add trailing section */
if (end > start) if (end > start)
fn(block, 1, start, end) fn(block, Extract.TEXT, start, end)
} }

View File

@@ -25,7 +25,7 @@
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
/** /**
* Table for indexing * Position table
*/ */
export type PositionTable = number[][] export type PositionTable = number[][]
@@ -46,62 +46,103 @@ export type Position = number
* when executing the query. It then highlights all occurrences, and returns * when executing the query. It then highlights all occurrences, and returns
* their concatenation. In case of multiple blocks, two are returned. * their concatenation. In case of multiple blocks, two are returned.
* *
* @param value - String value * @param input - Input value
* @param table - Table for indexing * @param table - Table for indexing
* @param positions - Occurrences * @param positions - Occurrences
* *
* @returns Highlighted string value * @returns Highlighted string value
*/ */
export function highlight( export function highlight(
value: string, table: PositionTable, positions: Position[] input: string, table: PositionTable, positions: Position[]
): string { ): string {
return highlightAll([input], table, positions).pop()!
}
/* Map occurrences to blocks */ /**
const blocks = new Map<number, number[]>() * Highlight all occurrences in a set of strings
for (const i of positions.sort((a, b) => a - b)) { *
const block = i >>> 20 * @param inputs - Input values
const index = i & 0xFFFFF * @param table - Table for indexing
* @param positions - Occurrences
*
* @returns Highlighted string values
*/
export function highlightAll(
inputs: string[], table: PositionTable, positions: Position[]
): string[] {
/* Ensure presence of block group */ /* Map blocks to input values */
let group = blocks.get(block) const mapping = [0]
if (typeof group === "undefined") for (let t = 1; t < table.length; t++) {
blocks.set(block, group = []) const prev = table[t - 1]
const next = table[t]
/* Add index to group */ /* Check if table points to new block */
group.push(index) const p = prev[prev.length - 1] >>> 2 & 0x3FF
const q = next[0] >>> 12
/* Add block to mapping */
mapping.push(+(p > q) + mapping[mapping.length - 1])
} }
/* Compute slices */ /* Highlight strings one after another */
const slices: string[] = [] return inputs.map((input, i) => {
for (const [block, indexes] of blocks) {
const t = table[block]
/* Extract positions and length */ /* Map occurrences to blocks */
const start = t[0] >>> 12 const blocks = new Map<number, number[]>()
const end = t[t.length - 1] >>> 12 for (const p of positions.sort((a, b) => a - b)) {
const length = t[t.length - 1] >>> 2 & 0x3FF const index = p & 0xFFFFF
const block = p >>> 20
if (mapping[block] !== i)
continue
/* Extract and highlight slice/block */ /* Ensure presence of block group */
let slice = value.slice(start, end + length) let group = blocks.get(block)
for (const i of indexes.sort((a, b) => b - a)) { if (typeof group === "undefined")
blocks.set(block, group = [])
/* Retrieve offset and length of match */ /* Add index to group */
const p = (t[i] >>> 12) - start group.push(index)
const q = (t[i] >>> 2 & 0x3FF) + p
/* Wrap occurrence */
slice = [
slice.slice(0, p),
"<mark>", slice.slice(p, q), "</mark>",
slice.slice(q)
].join("")
} }
/* Append slice and abort if we have two */ /* Just return string, if no occurrences */
if (slices.push(slice) === 2) if (blocks.size === 0)
break return input
}
/* Return highlighted string value */ /* Compute slices */
return slices.join("") const slices: string[] = []
for (const [block, indexes] of blocks) {
const t = table[block]
/* Extract positions and length */
const start = t[0] >>> 12
const end = t[t.length - 1] >>> 12
const length = t[t.length - 1] >>> 2 & 0x3FF
/* Extract and highlight slice */
let slice = input.slice(start, end + length)
for (const j of indexes.sort((a, b) => b - a)) {
/* Retrieve offset and length of match */
const p = (t[j] >>> 12) - start
const q = (t[j] >>> 2 & 0x3FF) + p
/* Wrap occurrence */
slice = [
slice.slice(0, p),
"<mark>",
slice.slice(p, q),
"</mark>",
slice.slice(q)
].join("")
}
/* Append slice and abort if we have two */
if (slices.push(slice) === 2)
break
}
/* Return highlighted slices */
return slices.join("")
})
} }

View File

@@ -21,19 +21,29 @@
*/ */
import { split } from "../_" import { split } from "../_"
import { extract } from "../extract" import {
Extract,
extract
} from "../extract"
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* Functions * Functions
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
/** /**
* Split a string into tokens * Split a string or set of strings into tokens
* *
* This tokenizer supersedes the default tokenizer that is provided by Lunr.js, * This tokenizer supersedes the default tokenizer that is provided by Lunr.js,
* as it is aware of HTML tags and allows for multi-character splitting. * as it is aware of HTML tags and allows for multi-character splitting.
* *
* @param input - String value or token * It takes the given inputs, splits each of them into markup and text sections,
* tokenizes and segments (if necessary) each of them, and then indexes them in
* a table by using a compact bit representation. Bitwise techniques are used
* to write and read from the table during indexing and querying.
*
* @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller
*
* @param input - Input value(s)
* *
* @returns Tokens * @returns Tokens
*/ */
@@ -41,90 +51,89 @@ export function tokenize(
input?: string | string[] input?: string | string[]
): lunr.Token[] { ): lunr.Token[] {
const tokens: lunr.Token[] = [] const tokens: lunr.Token[] = []
if (typeof input === "undefined")
return tokens
/** /* Initialize segmenter, if loaded */
* Initialize segmenter, if loaded
*
* Note that doing this here is not ideal, but it's okay as we just test it
* before bringing the new search implementation in its final shape.
*/
const segmenter = "TinySegmenter" in lunr const segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter() ? new lunr.TinySegmenter()
: undefined : undefined
/* Tokenize an array of string values */ /* Tokenize strings one after another */
if (Array.isArray(input)) { const inputs = Array.isArray(input) ? input : [input]
// @todo: handle multi-valued fields (e.g. tags) for (let i = 0; i < inputs.length; i++) {
for (const value of input)
tokens.push(...tokenize(value))
/* Tokenize a string value */
} else if (input) {
const table = lunr.tokenizer.table const table = lunr.tokenizer.table
const total = table.length
/* Split string into sections and tokenize content blocks */ /* Split string into sections and tokenize content blocks */
extract(input, (block, type, start, end) => { extract(inputs[i], (block, type, start, end) => {
if (type & 1) { block += total
const section = input.slice(start, end) switch (type) {
split(section, lunr.tokenizer.separator, (index, until) => {
/** /* Handle markup */
* Apply segmenter after tokenization. Note that the segmenter will case Extract.TAG_OPEN:
* also split words at word boundaries, which is not what we want, so case Extract.TAG_CLOSE:
* we need to check if we can somehow mitigate this behavior.
*/
if (typeof segmenter !== "undefined") {
const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection)
for (let i = 0, l = 0; i < segments.length; i++) {
/* Add block to table */
table[block] ||= []
table[block].push(
start + index + l << 12 |
segments[i].length << 2 |
type
)
/* Add block as token */
tokens.push(new lunr.Token(
segments[i].toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
/* Keep track of length */
l += segments[i].length
}
return // combine segmenter with other approach!?
}
}
/* Add block to table */
table[block] ||= [] table[block] ||= []
table[block].push( table[block].push(
start + index << 12 | start << 12 |
until - index << 2 | end - start << 2 |
type type
) )
break
/* Add block as token */ /* Handle text content */
tokens.push(new lunr.Token( case Extract.TEXT:
section.slice(index, until).toLowerCase(), { const section = inputs[i].slice(start, end)
position: block << 20 | table[block].length - 1 split(section, lunr.tokenizer.separator, (index, until) => {
/**
* Apply segmenter after tokenization. Note that the segmenter will
* also split words at word boundaries, which is not what we want,
* so we need to check if we can somehow mitigate this behavior.
*/
if (typeof segmenter !== "undefined") {
const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection)
for (let s = 0, l = 0; s < segments.length; s++) {
/* Add block to section */
table[block] ||= []
table[block].push(
start + index + l << 12 |
segments[s].length << 2 |
type
)
/* Add token with position */
tokens.push(new lunr.Token(
segments[s].toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
/* Keep track of length */
l += segments[s].length
}
return
}
} }
))
})
/* Add non-content block to table */ /* Add block to section */
} else { table[block] ||= []
table[block] ||= [] table[block].push(
table[block].push( start + index << 12 |
start << 12 | until - index << 2 |
end - start << 2 | type
type )
)
/* Add token with position */
tokens.push(new lunr.Token(
section.slice(index, until).toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
})
} }
}) })
} }

View File

@@ -26,15 +26,17 @@ import lunr from "lunr"
* Global types * Global types
* ------------------------------------------------------------------------- */ * ------------------------------------------------------------------------- */
type Fields = "text" | "title" | "tags"
declare global { declare global {
namespace lunr { namespace lunr {
/** /**
* Index - expose inverted index * Index - expose inverted index
*/ */
interface Index { interface Index { // this is defined in the actual interface...
invertedIndex: Record<string, unknown> invertedIndex: Record<string, unknown>
fields: string[] // @todo: make typing generic? fields: Fields[]
} }
interface Builder { interface Builder {