Slight refactoring of tokenizer

This commit is contained in:
squidfunk 2022-12-18 20:51:39 +01:00
parent 8d190ef150
commit b64d0a6993
10 changed files with 27 additions and 24 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -212,7 +212,7 @@
"base": base_url,
"features": features,
"translations": {},
"search": "assets/javascripts/workers/search.3de43c86.min.js" | url
"search": "assets/javascripts/workers/search.d78809e2.min.js" | url
} -%}
{%- if config.extra.version -%}
{%- set _ = app.update({ "version": config.extra.version }) -%}
@@ -246,7 +246,7 @@
{% endfor %}
{% endblock %}
{% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d6bc9295.min.css' | url }}">
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.82c6347d.min.css' | url }}">
<script src="{{ 'assets/javascripts/extra/bundle.cfb3feee.min.js' | url }}" defer></script>
{% endif %}
</body>

View File

@@ -79,7 +79,7 @@ function extractor(table: Map<string, PositionTable>) {
if (typeof doc[name] === "undefined")
return undefined
/* Compute identifier and initiable table */
/* Compute identifier and initialize table */
const id = [doc.location, name].join(":")
table.set(id, lunr.tokenizer.table = [])
@@ -162,6 +162,11 @@ export class Search {
this.tokenizer = tokenize as typeof lunr.tokenizer
lunr.tokenizer.separator = new RegExp(config.separator)
/* Set up custom segmenter, if loaded */
lunr.segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter()
: undefined
/* Compute functions to be removed from the pipeline */
const fns = difference([
"trimmer", "stopWordFilter", "stemmer"

View File

@@ -54,11 +54,6 @@ export function tokenize(
if (typeof input === "undefined")
return tokens
/* Initialize segmenter, if loaded */
const segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter()
: undefined
/* Tokenize strings one after another */
const inputs = Array.isArray(input) ? input : [input]
for (let i = 0; i < inputs.length; i++) {
@@ -67,13 +62,12 @@
/* Split string into sections and tokenize content blocks */
extract(inputs[i], (block, type, start, end) => {
block += total
table[block += total] ||= []
switch (type) {
/* Handle markup */
case Extract.TAG_OPEN:
case Extract.TAG_CLOSE:
table[block] ||= []
table[block].push(
start << 12 |
end - start << 2 |
@@ -91,10 +85,10 @@ export function tokenize(
* also split words at word boundaries, which is not what we want,
* so we need to check if we can somehow mitigate this behavior.
*/
if (typeof segmenter !== "undefined") {
if (typeof lunr.segmenter !== "undefined") {
const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection)
if (/^[MHIK]$/.test(lunr.segmenter.ctype_(subsection))) {
const segments = lunr.segmenter.segment(subsection)
for (let s = 0, l = 0; s < segments.length; s++) {
/* Add block to section */
@@ -120,7 +114,6 @@ export function tokenize(
}
/* Add block to section */
table[block] ||= []
table[block].push(
start + index << 12 |
until - index << 2 |

View File

@@ -78,6 +78,11 @@ declare global {
let table: number[][]
}
/**
* Segmenter
*/
let segmenter: TinySegmenter | undefined
/**
* Lexeme type
*/