BookStackApp_BookStack/app/Search/SearchIndex.php
Dan Brown 509af2463d
Search Index: Fixed SQL error when indexing large pages
Due to hitting statement placeholder limits (typically 65k)
when inserting index terms for single page.

Added test to cover.
Also added skipped tests for tests we don't always want to run.
For #5322
2024-12-11 15:55:19 +00:00

267 lines
8 KiB
PHP

<?php
namespace BookStack\Search;
use BookStack\Activity\Models\Tag;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Support\Collection;
class SearchIndex
{
/**
* A list of delimiter characters used to break-up parsed content into terms for indexing.
*/
public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
public function __construct(
protected EntityProvider $entityProvider
) {
}
/**
* Index the given entity.
*/
public function indexEntity(Entity $entity): void
{
$this->deleteEntityTerms($entity);
$terms = $this->entityToTermDataArray($entity);
$this->insertTerms($terms);
}
/**
* Index multiple Entities at once.
*
* @param Entity[] $entities
*/
public function indexEntities(array $entities): void
{
$terms = [];
foreach ($entities as $entity) {
$entityTerms = $this->entityToTermDataArray($entity);
array_push($terms, ...$entityTerms);
}
$this->insertTerms($terms);
}
/**
* Delete and re-index the terms for all entities in the system.
* Can take a callback which is used for reporting progress.
* Callback receives three arguments:
* - An instance of the model being processed
* - The number that have been processed so far.
* - The total number of that model to be processed.
*
* @param callable(Entity, int, int):void|null $progressCallback
*/
public function indexAllEntities(?callable $progressCallback = null): void
{
SearchTerm::query()->truncate();
foreach ($this->entityProvider->all() as $entityModel) {
$indexContentField = $entityModel instanceof Page ? 'html' : 'description';
$selectFields = ['id', 'name', $indexContentField];
/** @var Builder<Entity> $query */
$query = $entityModel->newQuery();
$total = $query->withTrashed()->count();
$chunkSize = 250;
$processed = 0;
$chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
$this->indexEntities($entities->all());
$processed = min($processed + $chunkSize, $total);
if (is_callable($progressCallback)) {
$progressCallback($entityModel, $processed, $total);
}
};
$entityModel->newQuery()
->select($selectFields)
->with(['tags:id,name,value,entity_id,entity_type'])
->chunk($chunkSize, $chunkCallback);
}
}
/**
* Delete related Entity search terms.
*/
public function deleteEntityTerms(Entity $entity): void
{
$entity->searchTerms()->delete();
}
/**
* Insert the given terms into the database.
* Chunks through the given terms to remain within database limits.
* @param array[] $terms
*/
protected function insertTerms(array $terms): void
{
$chunkedTerms = array_chunk($terms, 500);
foreach ($chunkedTerms as $termChunk) {
SearchTerm::query()->insert($termChunk);
}
}
/**
* Create a scored term array from the given text, where the keys are the terms
* and the values are their scores.
*
* @returns array<string, int>
*/
protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
$termMap = $this->textToTermCountMap($text);
foreach ($termMap as $term => $count) {
$termMap[$term] = floor($count * $scoreAdjustment);
}
return $termMap;
}
/**
* Create a scored term array from the given HTML, where the keys are the terms
* and the values are their scores.
*
* @returns array<string, int>
*/
protected function generateTermScoreMapFromHtml(string $html): array
{
if (empty($html)) {
return [];
}
$scoresByTerm = [];
$elementScoreAdjustmentMap = [
'h1' => 10,
'h2' => 5,
'h3' => 4,
'h4' => 3,
'h5' => 2,
'h6' => 1.5,
];
$html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
$doc = new HtmlDocument($html);
/** @var DOMNode $child */
foreach ($doc->getBodyChildren() as $child) {
$nodeName = $child->nodeName;
$termCounts = $this->textToTermCountMap(trim($child->textContent));
foreach ($termCounts as $term => $count) {
$scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
$scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
}
}
return $scoresByTerm;
}
/**
* Create a scored term map from the given set of entity tags.
*
* @param Tag[] $tags
*
* @returns array<string, int>
*/
protected function generateTermScoreMapFromTags(array $tags): array
{
$names = [];
$values = [];
foreach ($tags as $tag) {
$names[] = $tag->name;
$values[] = $tag->value;
}
$nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3);
$valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5);
return $this->mergeTermScoreMaps($nameMap, $valueMap);
}
/**
* For the given text, return an array where the keys are the unique term words
* and the values are the frequency of that term.
*
* @returns array<string, int>
*/
protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
$splitChars = static::$delimiters;
$token = strtok($text, $splitChars);
while ($token !== false) {
if (!isset($tokenMap[$token])) {
$tokenMap[$token] = 0;
}
$tokenMap[$token]++;
$token = strtok($splitChars);
}
return $tokenMap;
}
/**
* For the given entity, Generate an array of term data details.
* Is the raw term data, not instances of SearchTerm models.
*
* @returns array{term: string, score: float, entity_id: int, entity_type: string}[]
*/
protected function entityToTermDataArray(Entity $entity): array
{
$nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
$tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all());
if ($entity instanceof Page) {
$bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
} else {
$bodyTermsMap = $this->generateTermScoreMapFromText($entity->getAttribute('description') ?? '', $entity->searchFactor);
}
$mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap);
$dataArray = [];
$entityId = $entity->id;
$entityType = $entity->getMorphClass();
foreach ($mergedScoreMap as $term => $score) {
$dataArray[] = [
'term' => $term,
'score' => $score,
'entity_type' => $entityType,
'entity_id' => $entityId,
];
}
return $dataArray;
}
/**
* For the given term data arrays, Merge their contents by term
* while combining any scores.
*
* @param array<string, int>[] ...$scoreMaps
*
* @returns array<string, int>
*/
protected function mergeTermScoreMaps(...$scoreMaps): array
{
$mergedMap = [];
foreach ($scoreMaps as $scoreMap) {
foreach ($scoreMap as $term => $score) {
$mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
}
}
return $mergedMap;
}
}