BookStackApp_BookStack/app/Search/SearchResultsFormatter.php
Dan Brown 177cfd72bf
Search: Added structure for search term inputs
Sets things up to allow more complex terms ready to handle negation.
2024-10-02 17:31:45 +01:00

237 lines
8.4 KiB
PHP

<?php
namespace BookStack\Search;
use BookStack\Activity\Models\Tag;
use BookStack\Entities\Models\Entity;
use Illuminate\Support\HtmlString;
class SearchResultsFormatter
{
/**
* For the given array of entities, Prepare the models to be shown in search result
* output. This sets a series of additional attributes.
*
* @param Entity[] $results
*/
public function format(array $results, SearchOptions $options): void
{
foreach ($results as $result) {
$this->setSearchPreview($result, $options);
}
}
/**
* Update the given entity model to set attributes used for previews of the item
* primarily within search result lists.
*/
protected function setSearchPreview(Entity $entity, SearchOptions $options): void
{
$textProperty = $entity->textField;
$textContent = $entity->$textProperty;
$relevantSearchOptions = $options->exacts->merge($options->searches);
$terms = $relevantSearchOptions->toValueArray();
$originalContentByNewAttribute = [
'preview_name' => $entity->name,
'preview_content' => $textContent,
];
foreach ($originalContentByNewAttribute as $attributeName => $content) {
$targetLength = ($attributeName === 'preview_name') ? 0 : 260;
$matchRefs = $this->getMatchPositions($content, $terms);
$mergedRefs = $this->sortAndMergeMatchPositions($matchRefs);
$formatted = $this->formatTextUsingMatchPositions($mergedRefs, $content, $targetLength);
$entity->setAttribute($attributeName, new HtmlString($formatted));
}
$tags = $entity->relationLoaded('tags') ? $entity->tags->all() : [];
$this->highlightTagsContainingTerms($tags, $terms);
}
/**
* Highlight tags which match the given terms.
*
* @param Tag[] $tags
* @param string[] $terms
*/
protected function highlightTagsContainingTerms(array $tags, array $terms): void
{
foreach ($tags as $tag) {
$tagName = mb_strtolower($tag->name);
$tagValue = mb_strtolower($tag->value);
foreach ($terms as $term) {
$termLower = mb_strtolower($term);
if (mb_strpos($tagName, $termLower) !== false) {
$tag->setAttribute('highlight_name', true);
}
if (mb_strpos($tagValue, $termLower) !== false) {
$tag->setAttribute('highlight_value', true);
}
}
}
}
/**
* Get positions of the given terms within the given text.
* Is in the array format of [int $startIndex => int $endIndex] where the indexes
* are positions within the provided text.
*
* @return array<int, int>
*/
protected function getMatchPositions(string $text, array $terms): array
{
$matchRefs = [];
$text = mb_strtolower($text);
foreach ($terms as $term) {
$offset = 0;
$term = mb_strtolower($term);
$pos = mb_strpos($text, $term, $offset);
while ($pos !== false) {
$end = $pos + mb_strlen($term);
$matchRefs[$pos] = $end;
$offset = $end;
$pos = mb_strpos($text, $term, $offset);
}
}
return $matchRefs;
}
/**
* Sort the given match positions before merging them where they're
* adjacent or where they overlap.
*
* @param array<int, int> $matchPositions
*
* @return array<int, int>
*/
protected function sortAndMergeMatchPositions(array $matchPositions): array
{
ksort($matchPositions);
$mergedRefs = [];
$lastStart = 0;
$lastEnd = 0;
foreach ($matchPositions as $start => $end) {
if ($start > $lastEnd) {
$mergedRefs[$start] = $end;
$lastStart = $start;
$lastEnd = $end;
} elseif ($end > $lastEnd) {
$mergedRefs[$lastStart] = $end;
$lastEnd = $end;
}
}
return $mergedRefs;
}
/**
* Format the given original text, returning a version where terms are highlighted within.
* Returned content is in HTML text format.
* A given $targetLength of 0 asserts no target length limit.
*
* This is a complex function but written to be relatively efficient, going through the term matches in order
* so that we're only doing a one-time loop through of the matches. There is no further searching
* done within here.
*/
protected function formatTextUsingMatchPositions(array $matchPositions, string $originalText, int $targetLength): string
{
$maxEnd = mb_strlen($originalText);
$fetchAll = ($targetLength === 0);
$contextLength = ($fetchAll ? 0 : 32);
$firstStart = null;
$lastEnd = 0;
$content = '';
$contentTextLength = 0;
if ($fetchAll) {
$targetLength = $maxEnd * 2;
}
foreach ($matchPositions as $start => $end) {
// Get our outer text ranges for the added context we want to show upon the result.
$contextStart = max($start - $contextLength, 0, $lastEnd);
$contextEnd = min($end + $contextLength, $maxEnd);
// Adjust the start if we're going to be touching the previous match.
$startDiff = $start - $lastEnd;
if ($startDiff < 0) {
$contextStart = $start;
// Trims off '$startDiff' number of characters to bring it back to the start
// if this current match zone.
$content = mb_substr($content, 0, mb_strlen($content) + $startDiff);
$contentTextLength += $startDiff;
}
// Add ellipsis between results
if (!$fetchAll && $contextStart !== 0 && $contextStart !== $start) {
$content .= ' ...';
$contentTextLength += 4;
} elseif ($fetchAll) {
// Or fill in gap since the previous match
$fillLength = $contextStart - $lastEnd;
$content .= e(mb_substr($originalText, $lastEnd, $fillLength));
$contentTextLength += $fillLength;
}
// Add our content including the bolded matching text
$content .= e(mb_substr($originalText, $contextStart, $start - $contextStart));
$contentTextLength += $start - $contextStart;
$content .= '<strong>' . e(mb_substr($originalText, $start, $end - $start)) . '</strong>';
$contentTextLength += $end - $start;
$content .= e(mb_substr($originalText, $end, $contextEnd - $end));
$contentTextLength += $contextEnd - $end;
// Update our last end position
$lastEnd = $contextEnd;
// Update the first start position if it's not already been set
if (is_null($firstStart)) {
$firstStart = $contextStart;
}
// Stop if we're near our target
if ($contentTextLength >= $targetLength - 10) {
break;
}
}
// Just copy out the content if we haven't moved along anywhere.
if ($lastEnd === 0) {
$content = e(mb_substr($originalText, 0, $targetLength));
$contentTextLength = $targetLength;
$lastEnd = $targetLength;
}
// Pad out the end if we're low
$remainder = $targetLength - $contentTextLength;
if ($remainder > 10) {
$padEndLength = min($maxEnd - $lastEnd, $remainder);
$content .= e(mb_substr($originalText, $lastEnd, $padEndLength));
$lastEnd += $padEndLength;
$contentTextLength += $padEndLength;
}
// Pad out the start if we're still low
$remainder = $targetLength - $contentTextLength;
$firstStart = $firstStart ?: 0;
if (!$fetchAll && $remainder > 10 && $firstStart !== 0) {
$padStart = max(0, $firstStart - $remainder);
$content = ($padStart === 0 ? '' : '...') . e(mb_substr($originalText, $padStart, $firstStart - $padStart)) . mb_substr($content, 4);
}
// Add ellipsis if we're not at the end
if ($lastEnd < $maxEnd) {
$content .= '...';
}
return $content;
}
}