Compare commits

...

1 Commits

Author SHA1 Message Date
Dan Brown
f9b9040a06 LLM: Played with a reduced-scope non-vector LLM query system 2026-01-11 18:00:07 +00:00
15 changed files with 164 additions and 310 deletions

View File

@@ -6,13 +6,13 @@ namespace BookStack\Search\Queries;
use BookStack\Activity\Models\Tag;
use BookStack\Entities\Models\Entity;
use BookStack\Search\Queries\Services\VectorQueryService;
use BookStack\Search\Queries\Services\LlmQueryService;
use Illuminate\Support\Facades\DB;
class EntityVectorGenerator
{
public function __construct(
protected VectorQueryServiceProvider $vectorQueryServiceProvider
protected LlmQueryServiceProvider $vectorQueryServiceProvider
) {
}
@@ -60,7 +60,7 @@ class EntityVectorGenerator
* @param string[] $chunks
* @return float[] array
*/
protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQueryService): array
protected function chunksToEmbeddings(array $chunks, LlmQueryService $vectorQueryService): array
{
$embeddings = [];
foreach ($chunks as $index => $chunk) {

View File

@@ -2,25 +2,39 @@
namespace BookStack\Search\Queries;
use BookStack\Entities\Models\Entity;
use BookStack\Search\SearchRunner;
use Exception;
class LlmQueryRunner
{
public function __construct(
protected VectorQueryServiceProvider $vectorQueryServiceProvider,
protected LlmQueryServiceProvider $vectorQueryServiceProvider,
protected SearchRunner $searchRunner,
) {
}
/**
* Run a query against the configured LLM to produce a text response.
* @param VectorSearchResult[] $vectorResults
* Transform the given query into an array of terms which can be used
* to search for documents to help answer that query.
* @return string[]
* @throws Exception
*/
public function run(string $query, array $vectorResults): string
public function queryToSearchTerms(string $query): array
{
$queryService = $this->vectorQueryServiceProvider->get();
$matchesText = array_values(array_map(fn (VectorSearchResult $result) => $result->matchText, $vectorResults));
return $queryService->query($query, $matchesText);
return $queryService->queryToSearchTerms($query);
}
/**
* Run a query against the configured LLM to produce a text response.
* @param Entity[] $searchResults
* @throws Exception
*/
public function run(string $query, array $searchResults): string
{
$queryService = $this->vectorQueryServiceProvider->get();
return $queryService->query($query, $searchResults);
}
}

View File

@@ -5,22 +5,22 @@ declare(strict_types=1);
namespace BookStack\Search\Queries;
use BookStack\Http\HttpRequestService;
use BookStack\Search\Queries\Services\OpenAiVectorQueryService;
use BookStack\Search\Queries\Services\VectorQueryService;
use BookStack\Search\Queries\Services\OpenAiLlmQueryService;
use BookStack\Search\Queries\Services\LlmQueryService;
class VectorQueryServiceProvider
class LlmQueryServiceProvider
{
public function __construct(
protected HttpRequestService $http,
) {
}
public function get(): VectorQueryService
public function get(): LlmQueryService
{
$service = $this->getServiceName();
if ($service === 'openai') {
return new OpenAiVectorQueryService(config('services.openai'), $this->http);
return new OpenAiLlmQueryService(config('services.openai'), $this->http);
}
throw new \Exception("No '{$service}' LLM service found");

View File

@@ -3,6 +3,7 @@
namespace BookStack\Search\Queries;
use BookStack\Http\Controller;
use BookStack\Search\SearchOptions;
use BookStack\Search\SearchRunner;
use Illuminate\Http\Request;
@@ -13,7 +14,7 @@ class QueryController extends Controller
) {
// TODO - Check via testing
$this->middleware(function ($request, $next) {
if (!VectorQueryServiceProvider::isEnabled()) {
if (!LlmQueryServiceProvider::isEnabled()) {
$this->showPermissionError('/');
}
return $next($request);
@@ -35,27 +36,30 @@ class QueryController extends Controller
}
/**
* Perform a vector/LLM-based query search.
* Perform an LLM-based query search.
*/
public function run(Request $request, VectorSearchRunner $searchRunner, LlmQueryRunner $llmRunner)
public function run(Request $request, LlmQueryRunner $llmRunner)
{
// TODO - Rate limiting
$query = $request->get('query', '');
return response()->eventStream(function () use ($query, $searchRunner, $llmRunner) {
$results = $query ? $searchRunner->run($query) : [];
return response()->eventStream(function () use ($query, $llmRunner) {
$searchTerms = $llmRunner->queryToSearchTerms($query);
$searchOptions = SearchOptions::fromTermArray($searchTerms);
$searchResults = $this->searchRunner->searchEntities($searchOptions, count: 10)['results'];
$entities = [];
foreach ($results as $result) {
$entityKey = $result->entity->getMorphClass() . ':' . $result->entity->id;
foreach ($searchResults as $entity) {
$entityKey = $entity->getMorphClass() . ':' . $entity->id;
if (!isset($entities[$entityKey])) {
$entities[$entityKey] = $result->entity;
$entities[$entityKey] = $entity;
}
}
yield ['view' => view('entities.list', ['entities' => $entities])->render()];
yield ['result' => $llmRunner->run($query, $results)];
yield ['result' => $llmRunner->run($query, array_values($entities))];
});
}
}

View File

@@ -1,26 +0,0 @@
<?php
declare(strict_types=1);
namespace BookStack\Search\Queries;
use BookStack\Permissions\Models\JointPermission;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\HasMany;
/**
 * Eloquent model for a row of the "search_vectors" table, holding a piece of
 * entity text alongside its embedding.
 *
 * @property string $entity_type
 * @property int $entity_id
 * @property string $text
 * @property string $embedding
 */
class SearchVector extends Model
{
    // Rows of this model carry no created/updated timestamp columns.
    public $timestamps = false;

    /**
     * Joint permission rows which share both the entity ID and
     * the entity type of this vector.
     */
    public function jointPermissions(): HasMany
    {
        $sameEntityId = $this->hasMany(JointPermission::class, 'entity_id', 'entity_id');

        // Matching on ID alone is not enough since IDs repeat across entity types.
        return $sameEntityId->whereColumn('search_vectors.entity_type', '=', 'joint_permissions.entity_type');
    }
}

View File

@@ -2,7 +2,9 @@
namespace BookStack\Search\Queries\Services;
interface VectorQueryService
use BookStack\Entities\Models\Entity;
interface LlmQueryService
{
/**
* Generate embedding vectors from the given chunk of text.
@@ -10,12 +12,14 @@ interface VectorQueryService
*/
public function generateEmbeddings(string $text): array;
public function queryToSearchTerms(string $text): array;
/**
* Query the LLM service using the given user input, and
* relevant context text retrieved locally via a vector search.
* relevant entity content retrieved locally via a search.
* Returns the response output text from the LLM.
*
* @param string[] $context
* @param Entity[] $context
*/
public function query(string $input, array $context): string;
}

View File

@@ -0,0 +1,97 @@
<?php
namespace BookStack\Search\Queries\Services;
use BookStack\Http\HttpRequestService;
/**
 * LlmQueryService implementation which talks to an OpenAI-style HTTP API
 * ("v1/embeddings" and "v1/chat/completions" under the configured endpoint).
 */
class OpenAiLlmQueryService implements LlmQueryService
{
    /**
     * Soft cap, in bytes, on the combined document text sent as query context.
     * The document which crosses this cap is still sent in full.
     */
    protected const MAX_CONTEXT_LENGTH = 100000;

    protected string $key;
    protected string $endpoint;
    protected string $embeddingModel;
    protected string $queryModel;

    /**
     * @param array $options Service options; the 'key', 'endpoint', 'embedding_model'
     *                       and 'query_model' entries are read, each defaulting to ''.
     */
    public function __construct(
        protected array $options,
        protected HttpRequestService $http,
    ) {
        // TODO - Some kind of validation of options
        $this->key = $this->options['key'] ?? '';
        $this->endpoint = $this->options['endpoint'] ?? '';
        $this->embeddingModel = $this->options['embedding_model'] ?? '';
        $this->queryModel = $this->options['query_model'] ?? '';
    }

    /**
     * Send a JSON request to the given URI of the configured endpoint,
     * authorised using the configured key, and decode the JSON response.
     *
     * @throws \RuntimeException If the response body does not decode to a JSON array/object.
     */
    protected function jsonRequest(string $method, string $uri, array $data): array
    {
        $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
        // NOTE(review): 60 is presumably a timeout in seconds - confirm against HttpRequestService.
        $client = $this->http->buildClient(60);
        $request = $this->http->jsonRequest($method, $fullUrl, $data)
            ->withHeader('Authorization', 'Bearer ' . $this->key);

        $response = $client->sendRequest($request);
        $decoded = json_decode($response->getBody()->getContents(), true);

        // json_decode() gives null for invalid JSON, which would otherwise surface
        // as an opaque TypeError against this method's array return type.
        if (!is_array($decoded)) {
            throw new \RuntimeException("Unexpected non-JSON response from LLM service for request to '{$uri}'");
        }

        return $decoded;
    }

    /**
     * Generate an embedding vector for the given text using the embedding model.
     *
     * @throws \RuntimeException If the response does not contain embedding data.
     */
    public function generateEmbeddings(string $text): array
    {
        $response = $this->jsonRequest('POST', 'v1/embeddings', [
            'input' => $text,
            'model' => $this->embeddingModel,
        ]);

        $embedding = $response['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new \RuntimeException('No embedding data found in LLM service response');
        }

        return $embedding;
    }

    /**
     * Ask the query model to extract search words from the given user query.
     * Any response which does not provide a list under a 'words' key results
     * in an empty array, and non-string or blank list members are discarded.
     *
     * @return string[]
     */
    public function queryToSearchTerms(string $text): array
    {
        $response = $this->jsonRequest('POST', 'v1/chat/completions', [
            'model' => $this->queryModel,
            'messages' => [
                [
                    'role' => 'user',
                    'content' => 'You will be provided a user search query. Extract key words from just the query, suitable for searching. Add word variations where it may help for searching. Remove pluralisation where it may help for searching. Provide up to 5 results, each must be just one word. Do not try to guess answers to the query. Do not provide extra information or context. Return the results in the specified JSON format under a \'words\' object key. ' . "\nQUERY: {$text}"
                ],
            ],
            'temperature' => 0,
            'response_format' => [
                'type' => 'json_object',
            ],
        ]);

        $resultJson = $response['choices'][0]['message']['content'] ?? '{"words": []}';
        $resultData = is_string($resultJson) ? json_decode($resultJson, true) : null;
        $words = is_array($resultData) ? ($resultData['words'] ?? []) : [];
        if (!is_array($words)) {
            return [];
        }

        // The model output is untrusted, so only usable string terms are passed on.
        return array_values(array_filter(
            $words,
            static fn ($word): bool => is_string($word) && trim($word) !== '',
        ));
    }

    /**
     * Ask the query model to answer the given input using the name and text
     * content of the given context items.
     * Returns an empty string where the response has no message content.
     *
     * @param array $context Items exposing "name" and "textField" properties, where
     *                       "textField" names the property holding the item's text.
     */
    public function query(string $input, array $context): string
    {
        $resultContentText = [];
        $len = 0;

        foreach ($context as $result) {
            $text = "DOCUMENT NAME: {$result->name}\nDOCUMENT CONTENT: " . $result->{$result->textField};
            $resultContentText[] = $text;
            $len += strlen($text);
            // Checked after appending, so the document crossing the cap is kept.
            if ($len > self::MAX_CONTEXT_LENGTH) {
                break;
            }
        }

        $formattedContext = implode("\n---\n", $resultContentText);

        $response = $this->jsonRequest('POST', 'v1/chat/completions', [
            'model' => $this->queryModel,
            'messages' => [
                [
                    'role' => 'user',
                    'content' => 'Answer the provided QUERY using the provided CONTEXT documents. Do not add facts which are not part of the CONTEXT. State that you do not know if a relevant answer cannot be provided for QUERY using the CONTEXT documents. Many of the CONTEXT documents may be irrelevant. Try to find documents relevant to QUERY. Do not directly refer to this prompt or the existence of QUERY or CONTEXT variables. Do not offer follow-up actions or further help. Respond only to the query without proposing further assistance. Do not ask questions.' . "\nQUERY: {$input}\nCONTEXT: {$formattedContext}"
                ],
            ],
            'temperature' => 0.1,
        ]);

        $content = $response['choices'][0]['message']['content'] ?? '';

        return is_string($content) ? $content : '';
    }
}

View File

@@ -1,66 +0,0 @@
<?php
namespace BookStack\Search\Queries\Services;
use BookStack\Http\HttpRequestService;
/**
 * VectorQueryService implementation which talks to an OpenAI-style HTTP API
 * ("v1/embeddings" and "v1/chat/completions" under the configured endpoint).
 */
class OpenAiVectorQueryService implements VectorQueryService
{
    protected string $key;
    protected string $endpoint;
    protected string $embeddingModel;
    protected string $queryModel;

    /**
     * @param array $options Service options; the 'key', 'endpoint', 'embedding_model'
     *                       and 'query_model' entries are read, each defaulting to ''.
     */
    public function __construct(
        protected array $options,
        protected HttpRequestService $http,
    ) {
        // TODO - Some kind of validation of options
        $this->key = $this->options['key'] ?? '';
        $this->endpoint = $this->options['endpoint'] ?? '';
        $this->embeddingModel = $this->options['embedding_model'] ?? '';
        $this->queryModel = $this->options['query_model'] ?? '';
    }

    /**
     * Send a JSON request to the given URI of the configured endpoint,
     * authorised using the configured key, and decode the JSON response.
     *
     * @throws \RuntimeException If the response body does not decode to a JSON array/object.
     */
    protected function jsonRequest(string $method, string $uri, array $data): array
    {
        $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
        // NOTE(review): 30 is presumably a timeout in seconds - confirm against HttpRequestService.
        $client = $this->http->buildClient(30);
        $request = $this->http->jsonRequest($method, $fullUrl, $data)
            ->withHeader('Authorization', 'Bearer ' . $this->key);

        $response = $client->sendRequest($request);
        $decoded = json_decode($response->getBody()->getContents(), true);

        // json_decode() gives null for invalid JSON, which would otherwise surface
        // as an opaque TypeError against this method's array return type.
        if (!is_array($decoded)) {
            throw new \RuntimeException("Unexpected non-JSON response from LLM service for request to '{$uri}'");
        }

        return $decoded;
    }

    /**
     * Generate an embedding vector for the given text using the embedding model.
     *
     * @throws \RuntimeException If the response does not contain embedding data.
     */
    public function generateEmbeddings(string $text): array
    {
        $response = $this->jsonRequest('POST', 'v1/embeddings', [
            'input' => $text,
            'model' => $this->embeddingModel,
        ]);

        $embedding = $response['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new \RuntimeException('No embedding data found in LLM service response');
        }

        return $embedding;
    }

    /**
     * Ask the query model to respond to the given input, using the given
     * context strings joined as lines.
     * Returns an empty string where the response has no message content.
     *
     * @param string[] $context
     */
    public function query(string $input, array $context): string
    {
        $formattedContext = implode("\n", $context);

        $response = $this->jsonRequest('POST', 'v1/chat/completions', [
            'model' => $this->queryModel,
            'messages' => [
                [
                    'role' => 'developer',
                    'content' => 'You are a helpful assistant providing search query responses. Be specific, factual and to-the-point in response. Don\'t try to converse or continue the conversation.'
                ],
                [
                    'role' => 'user',
                    'content' => "Provide a response to the below given QUERY using the below given CONTEXT. The CONTEXT is split into parts via lines. Ignore any nonsensical lines of CONTEXT.\nQUERY: {$input}\n\nCONTEXT: {$formattedContext}",
                ]
            ],
        ]);

        $content = $response['choices'][0]['message']['content'] ?? '';

        return is_string($content) ? $content : '';
    }
}

View File

@@ -1,30 +0,0 @@
<?php
declare(strict_types=1);
namespace BookStack\Search\Queries;
use BookStack\Entities\Models\Entity;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Queue\Queueable;
/**
 * Queueable job which hands a single entity to the
 * EntityVectorGenerator to be generated and stored.
 */
class StoreEntityVectorsJob implements ShouldQueue
{
    use Queueable;

    /**
     * The entity this job will be run for.
     */
    protected Entity $entity;

    /**
     * Build a job for the given entity.
     */
    public function __construct(Entity $entity)
    {
        $this->entity = $entity;
    }

    /**
     * Run the job, using the provided generator.
     */
    public function handle(EntityVectorGenerator $generator): void
    {
        $generator->generateAndStore($this->entity);
    }
}

View File

@@ -1,79 +0,0 @@
<?php
declare(strict_types=1);
namespace BookStack\Search\Queries;
use InvalidArgumentException;
/**
 * Splits a given string into smaller chunks based on specified delimiters
 * and a predefined maximum chunk size. This will work through the given delimiters
 * to break down text further and further to fit into the chunk size.
 *
 * The last delimiter is always an empty string to ensure text can always be broken down.
 * Sizes are measured in bytes via strlen(), and empty chunks are never returned.
 */
class TextChunker
{
    /**
     * @param int      $chunkSize      Maximum size of a returned chunk.
     * @param string[] $delimiterOrder Delimiters to split by, in the order they should be attempted.
     *
     * @throws InvalidArgumentException If the chunk size is lower than 1.
     */
    public function __construct(
        protected int $chunkSize,
        protected array $delimiterOrder,
    ) {
        // Guarantee a final empty-string delimiter, which splits purely by size.
        if (count($this->delimiterOrder) === 0 || $this->delimiterOrder[count($this->delimiterOrder) - 1] !== '') {
            $this->delimiterOrder[] = '';
        }

        if ($this->chunkSize < 1) {
            throw new InvalidArgumentException('Chunk size must be greater than 0');
        }
    }

    /**
     * Break the given text down into chunks which fit the chunk size.
     * Text is split by the first delimiter, with parts packed together while they fit.
     * Any part too large for a chunk is broken down using the remaining delimiters.
     *
     * @return string[]
     */
    public function chunk(string $text): array
    {
        $delimiter = $this->delimiterOrder[0];
        $delimiterLength = strlen($delimiter);
        // NOTE(review): str_split() works on bytes so may split within a multi-byte character.
        $lines = ($delimiter === '') ? str_split($text, $this->chunkSize) : explode($delimiter, $text);

        $cChunk = ''; // Current chunk
        $cLength = 0; // Current chunk length
        $chunks = []; // Chunks to return
        $lDelim = ''; // Last delimiter

        foreach ($lines as $line) {
            $lineLength = strlen($line);

            if ($cLength + $lineLength + $delimiterLength <= $this->chunkSize) {
                // Line fits within the current chunk.
                $cChunk .= $line . $delimiter;
                $cLength += $lineLength + $delimiterLength;
                $lDelim = $delimiter;
            } elseif ($lineLength <= $this->chunkSize) {
                // Line fits a chunk by itself, so close off the current chunk and start another.
                // Trimmed using the last delimiter appended, which may be a sub-delimiter.
                $chunks = $this->appendChunk($chunks, $cChunk, $lDelim);
                $cChunk = $line . $delimiter;
                $cLength = $lineLength + $delimiterLength;
                $lDelim = $delimiter;
            } else {
                // Line is too large for any chunk, so break it down via the remaining delimiters.
                $subChunks = new static($this->chunkSize, array_slice($this->delimiterOrder, 1));
                $subDelimiter = $this->delimiterOrder[1] ?? '';
                $subDelimiterLength = strlen($subDelimiter);

                foreach ($subChunks->chunk($line) as $subChunk) {
                    $chunkLength = strlen($subChunk);
                    if ($cLength + $chunkLength + $subDelimiterLength <= $this->chunkSize) {
                        $cChunk .= $subChunk . $subDelimiter;
                        $cLength += $chunkLength + $subDelimiterLength;
                    } else {
                        $chunks = $this->appendChunk($chunks, $cChunk, $lDelim);
                        $cChunk = $subChunk . $subDelimiter;
                        $cLength = $chunkLength + $subDelimiterLength;
                    }
                    $lDelim = $subDelimiter;
                }
            }
        }

        return $this->appendChunk($chunks, $cChunk, $lDelim);
    }

    /**
     * Add the given raw chunk to the given list, after trimming the given
     * delimiter characters from both ends, skipping chunks left empty.
     *
     * @param string[] $chunks
     * @return string[]
     */
    protected function appendChunk(array $chunks, string $rawChunk, string $trimChars): array
    {
        $chunk = trim($rawChunk, $trimChars);
        if ($chunk !== '') {
            $chunks[] = $chunk;
        }

        return $chunks;
    }
}

View File

@@ -1,17 +0,0 @@
<?php
declare(strict_types=1);
namespace BookStack\Search\Queries;
use BookStack\Entities\Models\Entity;
/**
 * Immutable value object for a single vector search match.
 */
readonly class VectorSearchResult
{
    /** The entity which was matched. */
    public Entity $entity;

    /** The distance value reported for the match. */
    public float $distance;

    /** The text which was matched against. */
    public string $matchText;

    public function __construct(Entity $entity, float $distance, string $matchText)
    {
        $this->entity = $entity;
        $this->distance = $distance;
        $this->matchText = $matchText;
    }
}

View File

@@ -1,54 +0,0 @@
<?php
namespace BookStack\Search\Queries;
use BookStack\Entities\Tools\MixedEntityListLoader;
use BookStack\Permissions\PermissionApplicator;
use Exception;
/**
 * Runs vector searches against stored SearchVector rows.
 */
class VectorSearchRunner
{
    public function __construct(
        protected VectorQueryServiceProvider $vectorQueryServiceProvider,
        protected PermissionApplicator $permissions,
        protected MixedEntityListLoader $entityLoader,
    ) {
    }

    /**
     * Run a vector search query to find results across entities.
     * An embedding is generated for the query text, then compared via cosine distance
     * against stored vectors, keeping up to 10 matches with a distance below 0.6.
     * Matches without a loaded entity are left out of the results.
     *
     * @return VectorSearchResult[]
     * @throws Exception
     */
    public function run(string $query): array
    {
        $queryService = $this->vectorQueryServiceProvider->get();
        $queryVector = $queryService->generateEmbeddings($query);

        // The vector originates from an external service response, so each component
        // is forced to a float and the text is bound as a parameter rather than
        // being concatenated into the raw SQL expression.
        $vectorComponents = array_map(static fn ($value): float => (float) $value, $queryVector);
        $vectorText = '[' . implode(',', $vectorComponents) . ']';

        // TODO - Test permissions applied
        $topMatchesQuery = SearchVector::query()->select('text', 'entity_type', 'entity_id')
            ->selectRaw('VEC_DISTANCE_COSINE(VEC_FROMTEXT(?), embedding) as distance', [$vectorText])
            ->orderBy('distance', 'asc')
            ->having('distance', '<', 0.6)
            ->limit(10);

        $restrictedQuery = $this->permissions->restrictEntityRelationQuery($topMatchesQuery, 'search_vectors', 'entity_id', 'entity_type');
        $topMatches = $restrictedQuery->get();

        $this->entityLoader->loadIntoRelations($topMatches->all(), 'entity', true);

        $results = [];
        foreach ($topMatches as $match) {
            if ($match->relationLoaded('entity')) {
                $results[] = new VectorSearchResult(
                    $match->getRelation('entity'),
                    (float) $match->getAttribute('distance'),
                    $match->getAttribute('text'),
                );
            }
        }

        return $results;
    }
}

View File

@@ -6,8 +6,6 @@ use BookStack\Activity\Models\Tag;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
use BookStack\Search\Queries\StoreEntityVectorsJob;
use BookStack\Search\Queries\VectorQueryServiceProvider;
use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
@@ -39,10 +37,6 @@ class SearchIndex
$this->deleteEntityTerms($entity);
$terms = $this->entityToTermDataArray($entity);
$this->insertTerms($terms);
if (VectorQueryServiceProvider::isEnabled()) {
dispatch(new StoreEntityVectorsJob($entity));
}
}
/**
@@ -53,15 +47,10 @@ class SearchIndex
public function indexEntities(array $entities): void
{
$terms = [];
$vectorQueryEnabled = VectorQueryServiceProvider::isEnabled();
foreach ($entities as $entity) {
$entityTerms = $this->entityToTermDataArray($entity);
array_push($terms, ...$entityTerms);
if ($vectorQueryEnabled) {
dispatch(new StoreEntityVectorsJob($entity));
}
}
$this->insertTerms($terms);

View File

@@ -93,6 +93,18 @@ class SearchOptions
return $instance;
}
/**
 * Create a SearchOptions instance from an array of standard search terms.
 * Empty-string, null, false and non-scalar values are discarded,
 * while other terms, including "0", are kept in their given order.
 * @param string[] $terms
 */
public static function fromTermArray(array $terms): self
{
    // A callback-less array_filter() drops every falsy value, which would
    // silently discard the legitimate search term "0".
    $usableTerms = array_filter(
        $terms,
        static fn ($term): bool => is_scalar($term) && (string) $term !== '',
    );

    $instance = new self();
    // Re-indexed since array_filter() preserves the original keys.
    $instance->searches = SearchOptionSet::fromValueArray(array_values($usableTerms), TermSearchOption::class);
    $instance->limitOptions();

    return $instance;
}
/**
* Decode a search string and add its contents to this instance.
*/

View File

@@ -196,7 +196,7 @@ export class HttpManager {
url = window.baseUrl(url);
}
return createEventSource({
const es = createEventSource({
url,
method,
body: JSON.stringify(body),
@@ -204,8 +204,14 @@ export class HttpManager {
headers: {
'Content-Type': 'application/json',
'X-CSRF-TOKEN': this.getCSRFToken(),
},
onDisconnect: () => {
console.log('here');
es.close();
}
});
return es;
}
protected getCSRFToken(): string {