refactor index schema, remove sidecar parsing, remove TS

This commit is contained in:
2023-09-05 18:59:18 -04:00
parent b81ccebdb1
commit 8fdb832c85
84 changed files with 1420 additions and 2445 deletions

View File

@@ -1,131 +0,0 @@
import sqlite3
import orjson as json
import os
import string
from hashlib import md5
import random
from tqdm import tqdm
# SQL DDL for the generated index database. The whole string is passed to
# sqlite3's executescript() by the script entry point below, so every
# statement must be terminated with ';'.
#
# Tables created: thumbnail, version, document, delete_list, tag,
# document_sidecar, descriptor, the stats_* aggregates, embedding and model.
# Document-style ids are constrained to 32 characters (see the length(id)
# CHECKs), and document.json_data must satisfy json_valid().
schema = """
CREATE TABLE thumbnail (
id TEXT NOT NULL CHECK (
length(id) = 32
),
num INTEGER NOT NULL,
data BLOB NOT NULL,
PRIMARY KEY(id, num)
) WITHOUT ROWID;
CREATE TABLE version (
id INTEGER PRIMARY KEY AUTOINCREMENT,
date TEXT NOT NULL DEFAULT (CURRENT_TIMESTAMP)
);
CREATE TABLE document (
id TEXT PRIMARY KEY NOT NULL CHECK (
length(id) = 32
),
marked INTEGER NOT NULL DEFAULT (1),
version INTEGER NOT NULL REFERENCES version(id),
mtime INTEGER NOT NULL,
size INTEGER NOT NULL,
json_data TEXT NOT NULL CHECK (
json_valid(json_data)
)
);
CREATE TABLE delete_list (
id TEXT PRIMARY KEY CHECK (
length(id) = 32
)
) WITHOUT ROWID;
CREATE TABLE tag (
id TEXT NOT NULL,
tag TEXT NOT NULL,
PRIMARY KEY (id, tag)
);
CREATE TABLE document_sidecar (
id TEXT PRIMARY KEY NOT NULL, json_data TEXT NOT NULL
) WITHOUT ROWID;
CREATE TABLE descriptor (
id TEXT NOT NULL, version_major INTEGER NOT NULL,
version_minor INTEGER NOT NULL, version_patch INTEGER NOT NULL,
root TEXT NOT NULL, name TEXT NOT NULL,
rewrite_url TEXT, timestamp INTEGER NOT NULL
);
CREATE TABLE stats_treemap (
path TEXT NOT NULL, size INTEGER NOT NULL
);
CREATE TABLE stats_size_agg (
bucket INTEGER NOT NULL, count INTEGER NOT NULL
);
CREATE TABLE stats_date_agg (
bucket INTEGER NOT NULL, count INTEGER NOT NULL
);
CREATE TABLE stats_mime_agg (
mime TEXT NOT NULL, size INTEGER NOT NULL,
count INTEGER NOT NULL
);
CREATE TABLE embedding (
id TEXT REFERENCES document(id),
model_id INTEGER NOT NULL references model(id),
start INTEGER NOT NULL,
end INTEGER,
embedding BLOB NOT NULL,
PRIMARY KEY (id, model_id, start)
);
CREATE TABLE model (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE CHECK (
length(name) < 16
),
url TEXT,
path TEXT NOT NULL UNIQUE,
size INTEGER NOT NULL,
type TEXT NOT NULL CHECK (
type IN ('flat', 'nested')
)
);
"""
# 500 random ASCII letters, generated once at import time and reused as the
# "content" field of every synthetic document.
content = "".join(random.choices(string.ascii_letters, k=500))


def gen_document():
    """Build the bind parameters for one synthetic ``document`` row.

    Returns:
        A two-element list ``[doc_id, json_data]``: ``doc_id`` is the
        32-character hex MD5 digest of 8 random bytes, and ``json_data`` is
        the metadata payload serialized with the module's ``json`` alias.
    """
    doc_id = md5(random.randbytes(8)).hexdigest()
    payload = {
        "content": content,
        "mime": "image/jpeg",
        "extension": "jpeg",
        "name": "test",
        "path": "",
    }
    return [doc_id, json.dumps(payload)]
if __name__ == "__main__":
    # Generates a large index database filled with synthetic documents.
    DB_NAME = "big_index.sist2"
    SIZE = 30_000_000

    # Start from a fresh file. A missing file (e.g. on the first run) is
    # expected and must not abort the script.
    try:
        os.remove(DB_NAME)
    except FileNotFoundError:
        pass

    db = sqlite3.connect(DB_NAME)
    try:
        db.executescript(schema)
        # Disable the rollback journal and synchronous writes to speed up the
        # bulk insert; the file is rebuilt from scratch on every run.
        db.executescript("""
        PRAGMA journal_mode = OFF;
        PRAGMA synchronous = 0;
        """)

        for _ in tqdm(range(SIZE), total=SIZE):
            db.execute(
                "INSERT INTO document (id, version, mtime, size, json_data) VALUES (?, 1, 1000000, 10000, ?)",
                gen_document()
            )

        # TODO (schema ideas):
        # 1. Enable rowid from document
        # 2. CREATE TABLE marked (
        #        id INTEGER PRIMARY KEY,
        #        marked int
        #    );
        # 3. Set FK for document_sidecar, embedding, tag, thumbnail
        # 4. Toggle FK if debug
        db.commit()
    finally:
        # Always release the connection, even if an insert fails part-way.
        db.close()

View File

@@ -449,5 +449,4 @@ image/x-sigma-x3f, xf3
image/x-sony-arw, arw
image/x-sony-sr2, sr2
image/x-sony-srf, srf
image/x-epson-erf, erf
sist2/sidecar, s2meta
image/x-epson-erf, erf
1 application/x-matlab-data mat
449 image/x-sony-arw arw
450 image/x-sony-sr2 sr2
451 image/x-sony-srf srf
452 image/x-epson-erf erf
sist2/sidecar s2meta

View File

@@ -3,6 +3,7 @@ import zlib
mimes = {}
noparse = set()
ext_in_hash = set()
mime_ids = {}
major_mime = {
"sist2": 0,
@@ -102,6 +103,9 @@ cnt = 1
def mime_id(mime):
if mime in mime_ids:
return mime_ids[mime]
global cnt
major = mime.split("/")[0]
mime_id = str((major_mime[major] << 16) + cnt)
@@ -127,9 +131,7 @@ def mime_id(mime):
elif mime == "application/x-empty":
cnt -= 1
return "1"
elif mime == "sist2/sidecar":
cnt -= 1
return "2"
mime_ids[mime] = mime_id
return mime_id
@@ -197,4 +199,12 @@ with open("scripts/mime.csv") as f:
print(f"case {crc(mime)}: return {clean(mime)};")
print("default: return 0;}}")
# mime list
mime_list = ",".join(mime_id(x) for x in mimes.keys()) + ",0"
print(f"unsigned int mime_ids[] = {{{mime_list}}};")
print("unsigned int* get_mime_ids() { return mime_ids; }")
print("#endif")