sist2/scripts/generate_big_index.py

132 lines
3.0 KiB
Python

import sqlite3
import orjson as json
import os
import string
from hashlib import md5
import random
from tqdm import tqdm
schema = """
CREATE TABLE thumbnail (
id TEXT NOT NULL CHECK (
length(id) = 32
),
num INTEGER NOT NULL,
data BLOB NOT NULL,
PRIMARY KEY(id, num)
) WITHOUT ROWID;
CREATE TABLE version (
id INTEGER PRIMARY KEY AUTOINCREMENT,
date TEXT NOT NULL DEFAULT (CURRENT_TIMESTAMP)
);
CREATE TABLE document (
id TEXT PRIMARY KEY NOT NULL CHECK (
length(id) = 32
),
marked INTEGER NOT NULL DEFAULT (1),
version INTEGER NOT NULL REFERENCES version(id),
mtime INTEGER NOT NULL,
size INTEGER NOT NULL,
json_data TEXT NOT NULL CHECK (
json_valid(json_data)
)
);
CREATE TABLE delete_list (
id TEXT PRIMARY KEY CHECK (
length(id) = 32
)
) WITHOUT ROWID;
CREATE TABLE tag (
id TEXT NOT NULL,
tag TEXT NOT NULL,
PRIMARY KEY (id, tag)
);
CREATE TABLE document_sidecar (
id TEXT PRIMARY KEY NOT NULL, json_data TEXT NOT NULL
) WITHOUT ROWID;
CREATE TABLE descriptor (
id TEXT NOT NULL, version_major INTEGER NOT NULL,
version_minor INTEGER NOT NULL, version_patch INTEGER NOT NULL,
root TEXT NOT NULL, name TEXT NOT NULL,
rewrite_url TEXT, timestamp INTEGER NOT NULL
);
CREATE TABLE stats_treemap (
path TEXT NOT NULL, size INTEGER NOT NULL
);
CREATE TABLE stats_size_agg (
bucket INTEGER NOT NULL, count INTEGER NOT NULL
);
CREATE TABLE stats_date_agg (
bucket INTEGER NOT NULL, count INTEGER NOT NULL
);
CREATE TABLE stats_mime_agg (
mime TEXT NOT NULL, size INTEGER NOT NULL,
count INTEGER NOT NULL
);
CREATE TABLE embedding (
id TEXT REFERENCES document(id),
model_id INTEGER NOT NULL references model(id),
start INTEGER NOT NULL,
end INTEGER,
embedding BLOB NOT NULL,
PRIMARY KEY (id, model_id, start)
);
CREATE TABLE model (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE CHECK (
length(name) < 16
),
url TEXT,
path TEXT NOT NULL UNIQUE,
size INTEGER NOT NULL,
type TEXT NOT NULL CHECK (
type IN ('flat', 'nested')
)
);
"""
content = "".join(random.choices(string.ascii_letters, k=500))
def gen_document():
return [
md5(random.randbytes(8)).hexdigest(),
json.dumps({
"content": content,
"mime": "image/jpeg",
"extension": "jpeg",
"name": "test",
"path": "",
})
]
if __name__ == "__main__":
DB_NAME = "big_index.sist2"
SIZE = 30_000_000
os.remove(DB_NAME)
db = sqlite3.connect(DB_NAME)
db.executescript(schema)
db.executescript("""
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
""")
for _ in tqdm(range(SIZE), total=SIZE):
db.execute(
"INSERT INTO document (id, version, mtime, size, json_data) VALUES (?, 1, 1000000, 10000, ?)",
gen_document()
)
# 1. Enable rowid from document
# 2. CREATE TABLE marked (
# id INTEGER PRIMARY KEY,
# marked int
# );
# 3. Set FK for document_sidecar, embedding, tag, thumbnail
# 4. Toggle FK if debug
db.commit()