fix(server): use bigrams for cjk (#24285)

* use bigrams for cjk

* update sql

* linting

* actually migrate ocr

* fix backwards test

* use array

* tweaks
This commit is contained in:
Mert
2025-12-01 12:24:37 -05:00
committed by GitHub
parent d8ca210641
commit 95c29a8aea
5 changed files with 203 additions and 47 deletions

View File

@@ -0,0 +1,24 @@
import { Kysely, sql } from 'kysely';
import { tokenizeForSearch } from 'src/utils/database';
/**
 * Rebuilds the `ocr_search` table from the rows in `asset_ocr`.
 *
 * The table is emptied first, then the OCR text of each asset is concatenated
 * (space-separated) into a single string, passed through `tokenizeForSearch`,
 * and written back in batches.
 *
 * @param db - Kysely instance the migration runs against.
 */
export async function up(db: Kysely<any>): Promise<void> {
  const batchSize = 5000;

  // Start from an empty table so every row is regenerated below.
  await sql`truncate ${sql.table('ocr_search')}`.execute(db);

  // One row per asset, with all of its OCR text joined by single spaces.
  const aggregatedOcr = db
    .selectFrom('asset_ocr')
    .select(['assetId', sql<string>`string_agg(text, ' ')`.as('text')])
    .groupBy('assetId')
    .stream();

  const pending = [];
  for await (const row of aggregatedOcr) {
    pending.push({ assetId: row.assetId, text: tokenizeForSearch(row.text) });
    if (pending.length < batchSize) {
      continue;
    }

    // Batch is full: write it out, then reuse the same array for the next one.
    await db.insertInto('ocr_search').values(pending).execute();
    pending.length = 0;
  }

  // Write whatever is left over from the last, partially filled batch.
  if (pending.length !== 0) {
    await db.insertInto('ocr_search').values(pending).execute();
  }
}
/**
 * Rollback hook for this migration.
 *
 * Intentionally a no-op: it performs no statements and resolves immediately.
 */
export async function down(): Promise<void> {
  // Nothing to undo here.
}