Slight refactoring of tokenizer

This commit is contained in:
squidfunk 2022-12-18 20:51:39 +01:00
parent 8d190ef150
commit b64d0a6993
10 changed files with 27 additions and 24 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -212,7 +212,7 @@
"base": base_url, "base": base_url,
"features": features, "features": features,
"translations": {}, "translations": {},
"search": "assets/javascripts/workers/search.3de43c86.min.js" | url "search": "assets/javascripts/workers/search.d78809e2.min.js" | url
} -%} } -%}
{%- if config.extra.version -%} {%- if config.extra.version -%}
{%- set _ = app.update({ "version": config.extra.version }) -%} {%- set _ = app.update({ "version": config.extra.version }) -%}
@@ -246,7 +246,7 @@
{% endfor %} {% endfor %}
{% endblock %} {% endblock %}
{% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %} {% if page.meta and page.meta.ᴴₒᴴₒᴴₒ %}
<link rel="stylesheet" href="{{ 'assets/stylesheets/extra.d6bc9295.min.css' | url }}"> <link rel="stylesheet" href="{{ 'assets/stylesheets/extra.82c6347d.min.css' | url }}">
<script src="{{ 'assets/javascripts/extra/bundle.cfb3feee.min.js' | url }}" defer></script> <script src="{{ 'assets/javascripts/extra/bundle.cfb3feee.min.js' | url }}" defer></script>
{% endif %} {% endif %}
</body> </body>

View File

@@ -79,7 +79,7 @@ function extractor(table: Map<string, PositionTable>) {
if (typeof doc[name] === "undefined") if (typeof doc[name] === "undefined")
return undefined return undefined
/* Compute identifier and initiable table */ /* Compute identifier and initialize table */
const id = [doc.location, name].join(":") const id = [doc.location, name].join(":")
table.set(id, lunr.tokenizer.table = []) table.set(id, lunr.tokenizer.table = [])
@@ -162,6 +162,11 @@ export class Search {
this.tokenizer = tokenize as typeof lunr.tokenizer this.tokenizer = tokenize as typeof lunr.tokenizer
lunr.tokenizer.separator = new RegExp(config.separator) lunr.tokenizer.separator = new RegExp(config.separator)
/* Set up custom segmenter, if loaded */
lunr.segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter()
: undefined
/* Compute functions to be removed from the pipeline */ /* Compute functions to be removed from the pipeline */
const fns = difference([ const fns = difference([
"trimmer", "stopWordFilter", "stemmer" "trimmer", "stopWordFilter", "stemmer"

View File

@@ -54,11 +54,6 @@ export function tokenize(
if (typeof input === "undefined") if (typeof input === "undefined")
return tokens return tokens
/* Initialize segmenter, if loaded */
const segmenter = "TinySegmenter" in lunr
? new lunr.TinySegmenter()
: undefined
/* Tokenize strings one after another */ /* Tokenize strings one after another */
const inputs = Array.isArray(input) ? input : [input] const inputs = Array.isArray(input) ? input : [input]
for (let i = 0; i < inputs.length; i++) { for (let i = 0; i < inputs.length; i++) {
@@ -67,13 +62,12 @@
/* Split string into sections and tokenize content blocks */ /* Split string into sections and tokenize content blocks */
extract(inputs[i], (block, type, start, end) => { extract(inputs[i], (block, type, start, end) => {
block += total table[block += total] ||= []
switch (type) { switch (type) {
/* Handle markup */ /* Handle markup */
case Extract.TAG_OPEN: case Extract.TAG_OPEN:
case Extract.TAG_CLOSE: case Extract.TAG_CLOSE:
table[block] ||= []
table[block].push( table[block].push(
start << 12 | start << 12 |
end - start << 2 | end - start << 2 |
@@ -91,10 +85,10 @@
* also split words at word boundaries, which is not what we want, * also split words at word boundaries, which is not what we want,
* so we need to check if we can somehow mitigate this behavior. * so we need to check if we can somehow mitigate this behavior.
*/ */
if (typeof segmenter !== "undefined") { if (typeof lunr.segmenter !== "undefined") {
const subsection = section.slice(index, until) const subsection = section.slice(index, until)
if (/^[MHIK]$/.test(segmenter.ctype_(subsection))) { if (/^[MHIK]$/.test(lunr.segmenter.ctype_(subsection))) {
const segments = segmenter.segment(subsection) const segments = lunr.segmenter.segment(subsection)
for (let s = 0, l = 0; s < segments.length; s++) { for (let s = 0, l = 0; s < segments.length; s++) {
/* Add block to section */ /* Add block to section */
@@ -120,7 +114,6 @@
} }
/* Add block to section */ /* Add block to section */
table[block] ||= []
table[block].push( table[block].push(
start + index << 12 | start + index << 12 |
until - index << 2 | until - index << 2 |

View File

@@ -78,6 +78,11 @@ declare global {
let table: number[][] let table: number[][]
} }
/**
* Segmenter
*/
let segmenter: TinySegmenter | undefined
/** /**
* Lexeme type * Lexeme type
*/ */