mirror of
https://github.com/BookStackApp/BookStack.git
synced 2025-03-26 08:58:47 +00:00

This changes indexing so that a.b now indexes as "a", "b" AND "a.b" instead of just the first two, for periods and hyphens, so terms containing those characters can be searched within. Adds hyphens as a delimiter - #2095
70 lines
1.8 KiB
PHP
70 lines
1.8 KiB
PHP
<?php
|
|
|
|
namespace BookStack\Search;
|
|
|
|
/**
 * A custom text tokenizer which records & provides insight needed for our search indexing.
 * We used to use basic strtok() but this class does the following which that lacked:
 * - Tracks and provides the current/previous delimiter that we've stopped at.
 * - Returns empty tokens upon parsing a delimiter.
 *
 * The text is scanned one byte at a time (strlen() and string offsets are byte-based),
 * and each byte is checked against the delimiter string, so delimiters are expected
 * to be single-byte characters.
 */
class SearchTextTokenizer
{
    /** Byte offset in the text at which the next call to next() starts scanning. */
    protected int $currentIndex = 0;

    /** Length of the text in bytes, computed once at construction. */
    protected int $length;

    /** Delimiter that ended the most recently returned token ('' if there was none). */
    protected string $currentDelimiter = '';

    /** Value held by $currentDelimiter before the most recent token was returned. */
    protected string $previousDelimiter = '';

    /**
     * @param string $text       The text to split into tokens.
     * @param string $delimiters Every character in this string is treated as a delimiter.
     */
    public function __construct(
        protected string $text,
        protected string $delimiters = ' '
    ) {
        $this->length = strlen($this->text);
    }

    /**
     * Get the delimiter which ended the most recently returned token.
     * This is an empty string before any token has been read, and after the
     * final token when that token was ended by the end of the text.
     */
    public function currentDelimiter(): string
    {
        return $this->currentDelimiter;
    }

    /**
     * Get the delimiter which was current before the most recently returned token.
     * This is an empty string until at least two tokens have been read.
     */
    public function previousDelimiter(): string
    {
        return $this->previousDelimiter;
    }

    /**
     * Get the next token between delimiters.
     * An empty string is returned where a delimiter is hit with no preceding
     * content (for example, between two adjacent delimiters).
     * Returns false if there's no further tokens, in which case the tracked
     * delimiters are left unchanged.
     */
    public function next(): string|false
    {
        $token = '';

        for ($i = $this->currentIndex; $i < $this->length; $i++) {
            $char = $this->text[$i];
            if (str_contains($this->delimiters, $char)) {
                $this->previousDelimiter = $this->currentDelimiter;
                $this->currentDelimiter = $char;
                // Resume scanning just after the delimiter on the next call.
                $this->currentIndex = $i + 1;

                return $token;
            }

            $token .= $char;
        }

        // Compare strictly against the empty string: a plain truthiness check
        // would treat a trailing token of "0" as "no token" and silently drop it.
        if ($token !== '') {
            $this->currentIndex = $this->length;
            $this->previousDelimiter = $this->currentDelimiter;
            // The final token is ended by the end of the text, not by a delimiter.
            $this->currentDelimiter = '';

            return $token;
        }

        return false;
    }
}
|