mirror of
https://github.com/simon987/sist2.git
synced 2025-12-12 06:58:54 +00:00
refactor index schema, remove sidecar parsing, remove TS
This commit is contained in:
@@ -1,131 +0,0 @@
|
||||
import sqlite3
|
||||
import orjson as json
|
||||
import os
|
||||
import string
|
||||
from hashlib import md5
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
|
||||
# SQLite DDL for the generated index file; the script runs it once, via
# executescript(), against the freshly created database.
# Tables: thumbnail, version, document, delete_list, tag, document_sidecar,
# descriptor, stats_treemap, stats_size_agg, stats_date_agg, stats_mime_agg,
# embedding and model.
schema = """
CREATE TABLE thumbnail (
    id TEXT NOT NULL CHECK (
        length(id) = 32
    ),
    num INTEGER NOT NULL,
    data BLOB NOT NULL,
    PRIMARY KEY(id, num)
) WITHOUT ROWID;
CREATE TABLE version (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT NOT NULL DEFAULT (CURRENT_TIMESTAMP)
);
CREATE TABLE document (
    id TEXT PRIMARY KEY NOT NULL CHECK (
        length(id) = 32
    ),
    marked INTEGER NOT NULL DEFAULT (1),
    version INTEGER NOT NULL REFERENCES version(id),
    mtime INTEGER NOT NULL,
    size INTEGER NOT NULL,
    json_data TEXT NOT NULL CHECK (
        json_valid(json_data)
    )
);
CREATE TABLE delete_list (
    id TEXT PRIMARY KEY CHECK (
        length(id) = 32
    )
) WITHOUT ROWID;
CREATE TABLE tag (
    id TEXT NOT NULL,
    tag TEXT NOT NULL,
    PRIMARY KEY (id, tag)
);
CREATE TABLE document_sidecar (
    id TEXT PRIMARY KEY NOT NULL, json_data TEXT NOT NULL
) WITHOUT ROWID;
CREATE TABLE descriptor (
    id TEXT NOT NULL, version_major INTEGER NOT NULL,
    version_minor INTEGER NOT NULL, version_patch INTEGER NOT NULL,
    root TEXT NOT NULL, name TEXT NOT NULL,
    rewrite_url TEXT, timestamp INTEGER NOT NULL
);
CREATE TABLE stats_treemap (
    path TEXT NOT NULL, size INTEGER NOT NULL
);
CREATE TABLE stats_size_agg (
    bucket INTEGER NOT NULL, count INTEGER NOT NULL
);
CREATE TABLE stats_date_agg (
    bucket INTEGER NOT NULL, count INTEGER NOT NULL
);
CREATE TABLE stats_mime_agg (
    mime TEXT NOT NULL, size INTEGER NOT NULL,
    count INTEGER NOT NULL
);
CREATE TABLE embedding (
    id TEXT REFERENCES document(id),
    model_id INTEGER NOT NULL references model(id),
    start INTEGER NOT NULL,
    end INTEGER,
    embedding BLOB NOT NULL,
    PRIMARY KEY (id, model_id, start)
);
CREATE TABLE model (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE CHECK (
        length(name) < 16
    ),
    url TEXT,
    path TEXT NOT NULL UNIQUE,
    size INTEGER NOT NULL,
    type TEXT NOT NULL CHECK (
        type IN ('flat', 'nested')
    )
);
"""
|
||||
|
||||
# Shared payload: 500 random ASCII letters, generated once when the module
# loads and reused as the "content" field of every generated document.
content = "".join(random.choices(string.ascii_letters, k=500))


def gen_document():
    """Build the bind parameters for one synthetic ``document`` row.

    Returns:
        A two-item list ``[doc_id, json_data]``. ``doc_id`` is the hex MD5
        digest of 8 random bytes (32 characters). ``json_data`` is the
        serialized metadata record: a fixed ``image/jpeg`` entry carrying
        the shared module-level ``content`` payload.
    """
    doc_id = md5(random.randbytes(8)).hexdigest()
    metadata = {
        "content": content,
        "mime": "image/jpeg",
        "extension": "jpeg",
        "name": "test",
        "path": "",
    }
    return [doc_id, json.dumps(metadata)]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Output file name and number of synthetic document rows to insert.
    DB_NAME = "big_index.sist2"
    SIZE = 30_000_000

    # Start from a clean file. A missing file only means there is nothing to
    # clean up (e.g. the first run), so it must not abort the script.
    try:
        os.remove(DB_NAME)
    except FileNotFoundError:
        pass

    db = sqlite3.connect(DB_NAME)
    db.executescript(schema)

    # Bulk-load settings: turn the rollback journal off and set
    # synchronous to 0 before inserting.
    db.executescript("""
        PRAGMA journal_mode = OFF;
        PRAGMA synchronous = 0;
    """)

    # One INSERT per generated document; version, mtime and size are fixed
    # literals, only the id and the JSON payload come from gen_document().
    for _ in tqdm(range(SIZE), total=SIZE):
        db.execute(
            "INSERT INTO document (id, version, mtime, size, json_data) VALUES (?, 1, 1000000, 10000, ?)",
            gen_document()
        )

    # Author's notes (none of these steps are performed by this script):
    # 1. Enable rowid from document
    # 2. CREATE TABLE marked (
    #     id INTEGER PRIMARY KEY,
    #     marked int
    # );
    # 3. Set FK for document_sidecar, embedding, tag, thumbnail
    # 4. Toggle FK if debug

    db.commit()
    db.close()
|
||||
@@ -449,5 +449,4 @@ image/x-sigma-x3f, xf3
|
||||
image/x-sony-arw, arw
|
||||
image/x-sony-sr2, sr2
|
||||
image/x-sony-srf, srf
|
||||
image/x-epson-erf, erf
|
||||
sist2/sidecar, s2meta
|
||||
image/x-epson-erf, erf
|
||||
|
@@ -3,6 +3,7 @@ import zlib
|
||||
mimes = {}
|
||||
noparse = set()
|
||||
ext_in_hash = set()
|
||||
mime_ids = {}
|
||||
|
||||
major_mime = {
|
||||
"sist2": 0,
|
||||
@@ -102,6 +103,9 @@ cnt = 1
|
||||
|
||||
|
||||
def mime_id(mime):
|
||||
if mime in mime_ids:
|
||||
return mime_ids[mime]
|
||||
|
||||
global cnt
|
||||
major = mime.split("/")[0]
|
||||
mime_id = str((major_mime[major] << 16) + cnt)
|
||||
@@ -127,9 +131,7 @@ def mime_id(mime):
|
||||
elif mime == "application/x-empty":
|
||||
cnt -= 1
|
||||
return "1"
|
||||
elif mime == "sist2/sidecar":
|
||||
cnt -= 1
|
||||
return "2"
|
||||
mime_ids[mime] = mime_id
|
||||
return mime_id
|
||||
|
||||
|
||||
@@ -197,4 +199,12 @@ with open("scripts/mime.csv") as f:
|
||||
print(f"case {crc(mime)}: return {clean(mime)};")
|
||||
|
||||
print("default: return 0;}}")
|
||||
|
||||
# mime list
|
||||
|
||||
mime_list = ",".join(mime_id(x) for x in mimes.keys()) + ",0"
|
||||
|
||||
print(f"unsigned int mime_ids[] = {{{mime_list}}};")
|
||||
print("unsigned int* get_mime_ids() { return mime_ids; }")
|
||||
|
||||
print("#endif")
|
||||
|
||||
Reference in New Issue
Block a user