0
0
Fork 0
mirror of https://github.com/BookStackApp/BookStack.git synced 2025-04-13 16:29:07 +00:00

Vectors: Built content vector indexing system

This commit is contained in:
Dan Brown 2025-03-24 16:28:14 +00:00
parent 0ec0913846
commit 8452099a5b
No known key found for this signature in database
GPG key ID: 46D9F943C24A2EF9
9 changed files with 269 additions and 1 deletions

View file

@ -22,6 +22,16 @@ return [
// Callback URL for social authentication methods
'callback_url' => env('APP_URL', false),
// LLM Service
// Options: openai
'llm' => env('LLM_SERVICE', ''),
// OpenAI API-compatible service details
'openai' => [
'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'),
'key' => env('OPENAI_KEY', ''),
],
'github' => [
'client_id' => env('GITHUB_APP_ID', false),
'client_secret' => env('GITHUB_APP_SECRET', false),

View file

@ -6,6 +6,8 @@ use BookStack\Activity\Models\Tag;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Search\Vectors\StoreEntityVectorsJob;
use BookStack\Search\Vectors\VectorQueryServiceProvider;
use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
@ -25,7 +27,7 @@ class SearchIndex
public static string $softDelimiters = ".-";
public function __construct(
protected EntityProvider $entityProvider
protected EntityProvider $entityProvider,
) {
}
@ -37,6 +39,10 @@ class SearchIndex
$this->deleteEntityTerms($entity);
$terms = $this->entityToTermDataArray($entity);
$this->insertTerms($terms);
if (VectorQueryServiceProvider::isEnabled()) {
dispatch(new StoreEntityVectorsJob($entity));
}
}
/**
@ -47,9 +53,15 @@ class SearchIndex
public function indexEntities(array $entities): void
{
$terms = [];
$vectorQueryEnabled = VectorQueryServiceProvider::isEnabled();
foreach ($entities as $entity) {
$entityTerms = $this->entityToTermDataArray($entity);
array_push($terms, ...$entityTerms);
if ($vectorQueryEnabled) {
dispatch(new StoreEntityVectorsJob($entity));
}
}
$this->insertTerms($terms);

View file

@ -0,0 +1,84 @@
<?php
namespace BookStack\Search\Vectors;
use BookStack\Entities\Models\Entity;
use BookStack\Search\Vectors\Services\VectorQueryService;
use Illuminate\Support\Facades\DB;
/**
 * Builds embedding vectors for an entity's text content and stores them
 * as SearchVector rows, one row per chunk of text.
 */
class EntityVectorGenerator
{
    /**
     * Maximum number of rows sent in a single INSERT statement.
     * Each row carries a full embedding as literal text, so statements are
     * kept small instead of inserting every chunk of a large entity at once.
     */
    protected const INSERT_BATCH_SIZE = 50;

    public function __construct(
        protected VectorQueryServiceProvider $vectorQueryServiceProvider
    ) {
    }

    /**
     * Generate embeddings for the given entity and replace any vectors
     * previously stored for it.
     *
     * Embeddings are generated before the database is touched, so a failure
     * in the embedding service leaves the existing rows in place. The
     * delete + insert pair then runs in a transaction so a failed insert
     * does not leave the entity without any vectors.
     */
    public function generateAndStore(Entity $entity): void
    {
        $vectorService = $this->vectorQueryServiceProvider->get();

        $text = $this->entityToPlainText($entity);
        $chunks = $this->chunkText($text);
        $embeddings = $this->chunksToEmbeddings($chunks, $vectorService);

        DB::transaction(function () use ($entity, $embeddings, $chunks) {
            $this->deleteExistingEmbeddingsForEntity($entity);
            $this->storeEmbeddings($embeddings, $chunks, $entity);
        });
    }

    /**
     * Remove every stored vector row belonging to the given entity.
     */
    protected function deleteExistingEmbeddingsForEntity(Entity $entity): void
    {
        SearchVector::query()
            ->where('entity_type', '=', $entity->getMorphClass())
            ->where('entity_id', '=', $entity->id)
            ->delete();
    }

    /**
     * Insert one row per embedding, pairing each embedding with the text
     * chunk found at the same index.
     *
     * @param array<int, array<int, float|int|string>> $embeddings
     * @param string[] $textChunks
     */
    protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $entity): void
    {
        $toInsert = [];
        foreach ($embeddings as $index => $embedding) {
            // The values originate from a remote service response and are placed
            // into a raw SQL expression, so force each one to a float first.
            $values = array_map(static fn ($value): float => (float) $value, $embedding);

            $toInsert[] = [
                'entity_id' => $entity->id,
                'entity_type' => $entity->getMorphClass(),
                'embedding' => DB::raw('STRING_TO_VECTOR("[' . implode(',', $values) . ']")'),
                'text' => $textChunks[$index],
            ];
        }

        // array_chunk() yields nothing for an empty list, so no query is run
        // when there is nothing to store.
        foreach (array_chunk($toInsert, static::INSERT_BATCH_SIZE) as $batch) {
            SearchVector::query()->insert($batch);
        }
    }

    /**
     * Generate an embedding for each chunk, keeping the chunk's index as the
     * key of its embedding so both arrays stay aligned.
     *
     * @param string[] $chunks
     * @return array<int, float[]>
     */
    protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQueryService): array
    {
        $embeddings = [];
        foreach ($chunks as $index => $chunk) {
            $embeddings[$index] = $vectorQueryService->generateEmbeddings($chunk);
        }

        return $embeddings;
    }

    /**
     * Split text into trimmed, non-empty lines.
     *
     * @return string[]
     */
    protected function chunkText(string $text): array
    {
        // TODO - Join adjacent smaller chunks up
        $sections = array_map(
            static fn (string $section): string => trim($section),
            explode("\n", $text)
        );

        // Compare against '' explicitly: a bare array_filter() would also
        // discard a line whose whole content is "0".
        $nonEmpty = array_filter($sections, static fn (string $section): bool => $section !== '');

        return array_values($nonEmpty);
    }

    /**
     * Build the plain text to embed: the entity name, a blank line, then the
     * content of the field named by the entity's `textField` property.
     */
    protected function entityToPlainText(Entity $entity): string
    {
        $text = $entity->name . "\n\n" . $entity->{$entity->textField};
        // TODO - Add tags
        return $text;
    }
}

View file

@ -0,0 +1,16 @@
<?php
namespace BookStack\Search\Vectors;
use Illuminate\Database\Eloquent\Model;
/**
 * Eloquent model for a single stored embedding of one chunk of entity text.
 *
 * Rows are written by EntityVectorGenerator::storeEmbeddings() and removed
 * by EntityVectorGenerator::deleteExistingEmbeddingsForEntity().
 *
 * @property string $entity_type Morph class of the entity the chunk belongs to.
 * @property int $entity_id ID of the entity the chunk belongs to.
 * @property string $text The text chunk the embedding was generated from.
 * @property string $embedding The vector value; written via STRING_TO_VECTOR() on insert.
 */
class SearchVector extends Model
{
// The search_vectors migration defines no created_at/updated_at columns.
public $timestamps = false;
}

View file

@ -0,0 +1,36 @@
<?php
namespace BookStack\Search\Vectors\Services;
use BookStack\Http\HttpRequestService;
/**
 * VectorQueryService implementation that talks to an OpenAI API-compatible
 * HTTP endpoint, authenticating with a bearer key.
 */
class OpenAiVectorQueryService implements VectorQueryService
{
    public function __construct(
        protected string $endpoint,
        protected string $key,
        protected HttpRequestService $http,
    ) {
    }

    /**
     * Send a JSON request to the configured endpoint and return the decoded
     * JSON response body.
     *
     * @param string $uri Path relative to the endpoint; surrounding slashes are normalised.
     * @throws \RuntimeException If the service responds with a non-2xx status
     *                           or the body is not a JSON object/array.
     */
    protected function jsonRequest(string $method, string $uri, array $data): array
    {
        $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
        $client = $this->http->buildClient(10);
        $request = $this->http->jsonRequest($method, $fullUrl, $data)
            ->withHeader('Authorization', 'Bearer ' . $this->key);

        $response = $client->sendRequest($request);
        $status = $response->getStatusCode();
        $decoded = json_decode($response->getBody()->getContents(), true);

        // NOTE(review): assumes the client returns error responses rather than
        // throwing for them (as PSR-18 clients do) - confirm against HttpRequestService.
        if ($status < 200 || $status >= 300) {
            $detail = is_array($decoded) ? ($decoded['error']['message'] ?? '') : '';
            $suffix = (is_string($detail) && $detail !== '') ? ": {$detail}" : '';
            throw new \RuntimeException("LLM service request failed with status {$status}{$suffix}");
        }

        if (!is_array($decoded)) {
            throw new \RuntimeException('LLM service returned a response that could not be decoded as JSON');
        }

        return $decoded;
    }

    /**
     * Generate an embedding vector for the given text using the
     * `v1/embeddings` endpoint and the `text-embedding-3-small` model.
     *
     * @return float[]
     * @throws \RuntimeException If the response does not contain an embedding.
     */
    public function generateEmbeddings(string $text): array
    {
        $response = $this->jsonRequest('POST', 'v1/embeddings', [
            'input' => $text,
            'model' => 'text-embedding-3-small',
        ]);

        $embedding = $response['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new \RuntimeException('LLM service response did not contain an embedding');
        }

        return $embedding;
    }
}

View file

@ -0,0 +1,12 @@
<?php
namespace BookStack\Search\Vectors\Services;
/**
 * Contract for a service able to turn text into embedding vectors.
 * OpenAiVectorQueryService implements it, and instances are obtained
 * through VectorQueryServiceProvider::get().
 */
interface VectorQueryService
{
/**
 * Generate embedding vectors from the given chunk of text.
 *
 * @param string $text The chunk of text to generate an embedding for.
 * @return float[]
 */
public function generateEmbeddings(string $text): array;
}

View file

@ -0,0 +1,28 @@
<?php
namespace BookStack\Search\Vectors;
use BookStack\Entities\Models\Entity;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
/**
 * Queueable job which generates and stores the search vectors for one entity.
 * Dispatched by SearchIndex when VectorQueryServiceProvider::isEnabled() is true.
 */
class StoreEntityVectorsJob implements ShouldQueue
{
use Queueable;
/**
 * Create a new job instance.
 *
 * @param Entity $entity The entity whose text content should be embedded.
 */
public function __construct(
protected Entity $entity
) {
}
/**
 * Execute the job.
 * Delegates to EntityVectorGenerator::generateAndStore() for the held entity.
 *
 * @param EntityVectorGenerator $generator Presumably resolved by the framework container when the job runs.
 */
public function handle(EntityVectorGenerator $generator): void
{
$generator->generateAndStore($this->entity);
}
}

View file

@ -0,0 +1,38 @@
<?php
namespace BookStack\Search\Vectors;
use BookStack\Http\HttpRequestService;
use BookStack\Search\Vectors\Services\OpenAiVectorQueryService;
use BookStack\Search\Vectors\Services\VectorQueryService;
/**
 * Resolves the vector query service selected by the `services.llm`
 * configuration value.
 */
class VectorQueryServiceProvider
{
    public function __construct(
        protected HttpRequestService $http,
    ) {
    }

    /**
     * Build the configured vector query service.
     *
     * @throws \Exception If the configured service name is not recognised
     *                    (including when no service is configured).
     */
    public function get(): VectorQueryService
    {
        $service = static::getServiceName();

        if ($service === 'openai') {
            $key = config('services.openai.key');
            $endpoint = config('services.openai.endpoint');

            return new OpenAiVectorQueryService($endpoint, $key, $this->http);
        }

        throw new \Exception("No '{$service}' LLM service found");
    }

    /**
     * Get the lower-cased name of the configured LLM service, or an empty
     * string when none is set.
     */
    protected static function getServiceName(): string
    {
        // config() returns null when the key is absent (for example with a stale
        // config cache); passing null to strtolower() is deprecated since PHP 8.1.
        return strtolower((string) config('services.llm', ''));
    }

    /**
     * Check whether an LLM service name has been configured.
     * This does not verify that the name maps to a supported service.
     */
    public static function isEnabled(): bool
    {
        return static::getServiceName() !== '';
    }
}

View file

@ -0,0 +1,32 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
// Migration that adds the `search_vectors` table, which holds one embedding
// row per chunk of entity text (rows are written by EntityVectorGenerator).
return new class extends Migration
{
/**
 * Run the migrations.
 * Creates the `search_vectors` table.
 */
public function up(): void
{
// TODO - Handle compatibility with older databases that don't support vectors
Schema::create('search_vectors', function (Blueprint $table) {
// Morph class and ID of the entity the row belongs to.
$table->string('entity_type', 100);
$table->integer('entity_id');
// The text chunk the embedding was generated from.
$table->text('text');
// The embedding vector for that chunk.
$table->vector('embedding');
// Supports the per-entity lookup used when deleting an entity's existing vectors.
$table->index(['entity_type', 'entity_id']);
});
}
/**
 * Reverse the migrations.
 * Drops the `search_vectors` table if it exists.
 */
public function down(): void
{
Schema::dropIfExists('search_vectors');
}
};