Fixed highlighting of tags

This commit is contained in:
squidfunk 2022-12-11 15:55:12 +01:00
parent ee1496499a
commit 24a3be8f04
15 changed files with 252 additions and 190 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -211,7 +211,7 @@
"base": base_url,
"features": features,
"translations": {},
"search": "assets/javascripts/workers/search.208e55ea.min.js" | url
"search": "assets/javascripts/workers/search.f5389c75.min.js" | url
} -%}
{%- if config.extra.version -%}
{%- set _ = app.update({ "version": config.extra.version }) -%}
@ -239,13 +239,13 @@
</script>
{% endblock %}
{% block scripts %}
<script src="{{ 'assets/javascripts/bundle.f1ef77e2.min.js' | url }}"></script>
<script src="{{ 'assets/javascripts/bundle.ce0331ff.min.js' | url }}"></script>
{% for path in config.extra_javascript %}
<script src="{{ path | url }}"></script>
{% endfor %}
{% endblock %}
{% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.b3906f4e.min.css' | url }}">
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d35223bf.min.css' | url }}">
<script src="{{ 'assets/javascripts/extra/bundle.f719a234.min.js' | url }}" defer></script>
{% endif %}
</body>

View File

@ -30,6 +30,7 @@ import {
Position,
PositionTable,
highlight,
highlightAll,
tokenize
} from "../internal"
import {
@ -46,7 +47,9 @@ import {
/**
* Search item
*/
export interface SearchItem extends SearchDocument {
export interface SearchItem
extends SearchDocument
{
score: number /* Score (relevance) */
terms: SearchQueryTerms /* Search query terms */
}
@ -213,6 +216,8 @@ export class Search {
.reduce<SearchItem[]>((item, { ref, score, matchData }) => {
let doc = this.map.get(ref)
if (typeof doc !== "undefined") {
/* Shallow copy document */
doc = { ...doc }
if (doc.tags)
doc.tags = [...doc.tags]
@ -223,39 +228,29 @@ export class Search {
Object.keys(matchData.metadata)
)
// we must collect all positions for each term!
// we now take the keys of the index
/* Highlight matches in fields */
for (const field of this.index.fields) {
if (!(field in doc))
if (typeof doc[field] === "undefined")
continue
/* Collect matches */
/* Collect positions from matches */
const positions: Position[] = []
for (const match of Object.values(matchData.metadata))
if (field in match)
if (typeof match[field] !== "undefined")
positions.push(...match[field].position)
/* Skip field, if no highlighting is necessary */
/* Skip highlighting, if no positions were collected */
if (!positions.length)
continue
// @ts-expect-error - @todo fix typings
if (Array.isArray(doc[field])) {
// @ts-expect-error - @todo fix typings
for (let i = 0; i < doc[field].length; i++) {
// @ts-expect-error - @todo fix typings
doc[field][i] = highlight(doc[field][i],
this.table.get([doc.location, field].join(":"))!,
positions
)
}
} else {
// @ts-expect-error - @todo fix typings
doc[field] = highlight(doc[field],
this.table.get([doc.location, field].join(":"))!,
positions
)
}
/* Load table and determine highlighting method */
const table = this.table.get([doc.location, field].join(":"))!
const fn = Array.isArray(doc[field])
? highlightAll
: highlight
// @ts-expect-error - TypeScript cannot match fn to the string vs. string[] type of doc[field]
doc[field] = fn(doc[field], table, positions)
}
/* Highlight title and text and apply post-query boosts */

View File

@ -41,15 +41,12 @@ type VisitorFn = (
/**
* Split a string using the given separator
*
* This function intentionally expects a visitor function argument, as opposed
* to collecting and returning all sections, for better memory efficiency.
*
* @param value - String value
* @param input - Input value
* @param separator - Separator
* @param fn - Visitor function
*/
export function split(
value: string, separator: RegExp, fn: VisitorFn
input: string, separator: RegExp, fn: VisitorFn
): void {
separator = new RegExp(separator, "g")
@ -57,10 +54,10 @@ export function split(
let match: RegExpExecArray | null
let index = 0
do {
match = separator.exec(value)
match = separator.exec(input)
/* Emit non-empty range */
const until = match?.index ?? value.length
const until = match?.index ?? input.length
if (index < until)
fn(index, until)

View File

@ -20,6 +20,24 @@
* IN THE SOFTWARE.
*/
/* ----------------------------------------------------------------------------
* Types
* ------------------------------------------------------------------------- */
/**
* Extraction type
*
* This type defines the possible values that are encoded into the lowest two
* bits of an entry in a block of the tokenization table. There are three
* types of interest: HTML opening and closing tags, as well as the actual
* text content we need to extract for indexing.
*
* The numeric values are part of the table encoding: the tokenizer composes
* entries as `start << 12 | length << 2 | type`, so every member must fit
* into two bits and must not be renumbered.
*/
export const enum Extract {
TAG_OPEN = 0, /* HTML opening tag, e.g. <p> */
TEXT = 1, /* Text content between or outside of tags */
TAG_CLOSE = 2 /* HTML closing tag, e.g. </p> */
}
/* ----------------------------------------------------------------------------
* Helper types
* ------------------------------------------------------------------------- */
@ -28,12 +46,12 @@
* Visitor function
*
* @param block - Block index
* @param operation - Operation index
* @param type - Extraction type
* @param start - Start offset
* @param end - End offset
*/
type VisitorFn = (
block: number, operation: number, start: number, end: number
block: number, type: Extract, start: number, end: number
) => void
/* ----------------------------------------------------------------------------
@ -41,18 +59,18 @@ type VisitorFn = (
* ------------------------------------------------------------------------- */
/**
* Extract all non-HTML parts of a string
* Split a string into markup and text sections
*
* This function preprocesses the given string by isolating all non-HTML parts,
* in order to ensure that HTML tags are removed before indexing. Note that it
* intentionally expects a visitor function argument, as opposed to collecting
* and returning all sections, for better memory efficiency.
* This function scans a string and divides it up into sections of markup and
* text. For each section, it invokes the given visitor function with the block
* index, extraction type, as well as start and end offsets. Using a visitor
* function (= streaming data) is ideal for minimizing pressure on the GC.
*
* @param value - String value
* @param input - Input value
* @param fn - Visitor function
*/
export function extract(
value: string, fn: VisitorFn
input: string, fn: VisitorFn
): void {
let block = 0 /* Current block */
@ -60,22 +78,22 @@ export function extract(
let end = 0 /* Current end offset */
/* Split string into sections */
for (let stack = 0; end < value.length; end++) {
for (let stack = 0; end < input.length; end++) {
/* Tag start after non-empty section */
if (value.charAt(end) === "<" && end > start) {
fn(block, 1, start, start = end)
/* Opening tag after non-empty section */
if (input.charAt(end) === "<" && end > start) {
fn(block, Extract.TEXT, start, start = end)
/* Tag end */
} else if (value.charAt(end) === ">") {
if (value.charAt(start + 1) === "/") {
/* Closing tag */
} else if (input.charAt(end) === ">") {
if (input.charAt(start + 1) === "/") {
if (--stack === 0)
fn(block++, 2, start, end + 1)
fn(block++, Extract.TAG_CLOSE, start, end + 1)
/* Tag is not self-closing */
} else if (value.charAt(end - 1) !== "/") {
} else if (input.charAt(end - 1) !== "/") {
if (stack++ === 0)
fn(block, 0, start, end + 1)
fn(block, Extract.TAG_OPEN, start, end + 1)
}
/* New section */
@ -85,5 +103,5 @@ export function extract(
/* Add trailing section */
if (end > start)
fn(block, 1, start, end)
fn(block, Extract.TEXT, start, end)
}

View File

@ -25,7 +25,7 @@
* ------------------------------------------------------------------------- */
/**
* Table for indexing
* Position table
*/
export type PositionTable = number[][]
@ -46,21 +46,55 @@ export type Position = number
* when executing the query. It then highlights all occurrences, and returns
* their concatenation. In case of multiple blocks, two are returned.
*
* @param value - String value
* @param input - Input value
* @param table - Table for indexing
* @param positions - Occurrences
*
* @returns Highlighted string value
*/
export function highlight(
value: string, table: PositionTable, positions: Position[]
input: string, table: PositionTable, positions: Position[]
): string {
return highlightAll([input], table, positions).pop()!
}
/**
* Highlight all occurrences in a set of strings
*
* @param inputs - Input values
* @param table - Table for indexing
* @param positions - Occurrences
*
* @returns Highlighted string values
*/
export function highlightAll(
inputs: string[], table: PositionTable, positions: Position[]
): string[] {
/* Map blocks to input values */
const mapping = [0]
for (let t = 1; t < table.length; t++) {
const prev = table[t - 1]
const next = table[t]
/* Check if table points to new block */
const p = prev[prev.length - 1] >>> 2 & 0x3FF
const q = next[0] >>> 12
/* Add block to mapping */
mapping.push(+(p > q) + mapping[mapping.length - 1])
}
/* Highlight strings one after another */
return inputs.map((input, i) => {
/* Map occurrences to blocks */
const blocks = new Map<number, number[]>()
for (const i of positions.sort((a, b) => a - b)) {
const block = i >>> 20
const index = i & 0xFFFFF
for (const p of positions.sort((a, b) => a - b)) {
const index = p & 0xFFFFF
const block = p >>> 20
if (mapping[block] !== i)
continue
/* Ensure presence of block group */
let group = blocks.get(block)
@ -71,6 +105,10 @@ export function highlight(
group.push(index)
}
/* Just return string, if no occurrences */
if (blocks.size === 0)
return input
/* Compute slices */
const slices: string[] = []
for (const [block, indexes] of blocks) {
@ -81,18 +119,20 @@ export function highlight(
const end = t[t.length - 1] >>> 12
const length = t[t.length - 1] >>> 2 & 0x3FF
/* Extract and highlight slice/block */
let slice = value.slice(start, end + length)
for (const i of indexes.sort((a, b) => b - a)) {
/* Extract and highlight slice */
let slice = input.slice(start, end + length)
for (const j of indexes.sort((a, b) => b - a)) {
/* Retrieve offset and length of match */
const p = (t[i] >>> 12) - start
const q = (t[i] >>> 2 & 0x3FF) + p
const p = (t[j] >>> 12) - start
const q = (t[j] >>> 2 & 0x3FF) + p
/* Wrap occurrence */
slice = [
slice.slice(0, p),
"<mark>", slice.slice(p, q), "</mark>",
"<mark>",
slice.slice(p, q),
"</mark>",
slice.slice(q)
].join("")
}
@ -102,6 +142,7 @@ export function highlight(
break
}
/* Return highlighted string value */
/* Return highlighted slices */
return slices.join("")
})
}

View File

@ -21,19 +21,29 @@
*/
import { split } from "../_"
import { extract } from "../extract"
import {
Extract,
extract
} from "../extract"
/* ----------------------------------------------------------------------------
* Functions
* ------------------------------------------------------------------------- */
/**
* Split a string into tokens
* Split a string or set of strings into tokens
*
* This tokenizer supersedes the default tokenizer that is provided by Lunr.js,
* as it is aware of HTML tags and allows for multi-character splitting.
*
* @param input - String value or token
* It takes the given inputs, splits each of them into markup and text sections,
* tokenizes and segments (if necessary) each of them, and then indexes them in
* a table by using a compact bit representation. Bitwise techniques are used
* to write and read from the table during indexing and querying.
*
* @see https://bit.ly/3W3Xw4J - Search: better, faster, smaller
*
* @param input - Input value(s)
*
* @returns Tokens
*/
@ -41,67 +51,75 @@ export function tokenize(
input?: string | string[]
): lunr.Token[] {
const tokens: lunr.Token[] = []
if (typeof input === "undefined")
return tokens
/**
* Initialize segmenter, if loaded
*
* Note that doing this here is not ideal, but it's okay as we just test it
* before bringing the new search implementation in its final shape.
*/
/* Initialize segmenter, if loaded */
const segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter()
: undefined
/* Tokenize an array of string values */
if (Array.isArray(input)) {
// @todo: handle multi-valued fields (e.g. tags)
for (const value of input)
tokens.push(...tokenize(value))
/* Tokenize a string value */
} else if (input) {
/* Tokenize strings one after another */
const inputs = Array.isArray(input) ? input : [input]
for (let i = 0; i < inputs.length; i++) {
const table = lunr.tokenizer.table
const total = table.length
/* Split string into sections and tokenize content blocks */
extract(input, (block, type, start, end) => {
if (type & 1) {
const section = input.slice(start, end)
extract(inputs[i], (block, type, start, end) => {
block += total
switch (type) {
/* Handle markup */
case Extract.TAG_OPEN:
case Extract.TAG_CLOSE:
table[block] ||= []
table[block].push(
start << 12 |
end - start << 2 |
type
)
break
/* Handle text content */
case Extract.TEXT:
const section = inputs[i].slice(start, end)
split(section, lunr.tokenizer.separator, (index, until) => {
/**
* Apply segmenter after tokenization. Note that the segmenter will
* also split words at word boundaries, which is not what we want, so
* we need to check if we can somehow mitigate this behavior.
* also split words at word boundaries, which is not what we want,
* so we need to check if we can somehow mitigate this behavior.
*/
if (typeof segmenter !== "undefined") {
const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection)
for (let i = 0, l = 0; i < segments.length; i++) {
for (let s = 0, l = 0; s < segments.length; s++) {
/* Add block to table */
/* Add block to section */
table[block] ||= []
table[block].push(
start + index + l << 12 |
segments[i].length << 2 |
segments[s].length << 2 |
type
)
/* Add block as token */
/* Add token with position */
tokens.push(new lunr.Token(
segments[i].toLowerCase(), {
segments[s].toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
/* Keep track of length */
l += segments[i].length
l += segments[s].length
}
return // combine segmenter with other approach!?
return
}
}
/* Add block to table */
/* Add block to section */
table[block] ||= []
table[block].push(
start + index << 12 |
@ -109,22 +127,13 @@ export function tokenize(
type
)
/* Add block as token */
/* Add token with position */
tokens.push(new lunr.Token(
section.slice(index, until).toLowerCase(), {
position: block << 20 | table[block].length - 1
}
))
})
/* Add non-content block to table */
} else {
table[block] ||= []
table[block].push(
start << 12 |
end - start << 2 |
type
)
}
})
}

View File

@ -26,15 +26,17 @@ import lunr from "lunr"
* Global types
* ------------------------------------------------------------------------- */
/**
* Names of the document fields that the search index exposes
*
* Used below to type `lunr.Index.fields`, so that iterating over the fields
* yields keys that can be used to access a document.
*/
type Fields = "text" | "title" | "tags"
declare global {
namespace lunr {
/**
* Index - expose inverted index
*/
interface Index {
interface Index { // this is defined in the actual interface...
invertedIndex: Record<string, unknown>
fields: string[] // @todo: make typing generic?
fields: Fields[]
}
interface Builder {