This commit is contained in:
2026-03-04 09:11:53 +00:00
parent e24c0cdf33
commit bbaebf1f70
2 changed files with 5 additions and 84 deletions
+4
View File
@@ -19,3 +19,7 @@ QA specific embedding models?
Evaluation metrics: how well is it doing?
rate my response!?
examples into prompts & better prompts
common model attributes - temperature & top-k
-83
View File
@@ -1,83 +0,0 @@
import turso
from config_loader import load_config
from embedding import LocalLMEmbeddings
# Load the project configuration once, then pull out the three settings this
# script uses: the embedding model name, the API base URL and the embedding
# batch size.
CFG = load_config()
EMBEDDING_MODEL, API_BASE, EMBEDDING_BATCH_SIZE = (
    CFG["models"]["embedding"],
    CFG["api"]["base_url"],
    CFG["ingestion"]["embedding_batch_size"],
)
# Open the "dmv.db" database and keep a single cursor for every statement
# issued by this script.
con = turso.connect("dmv.db")
cur = con.cursor()

# One row per text chunk: where it came from, the chunk text itself, and its
# embedding (column declared as F32_BLOB(4096)).
CREATE_NOTES_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS notes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
file_name TEXT NOT NULL,
chunk_data TEXT,
embedding F32_BLOB(4096),
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
)"""
CREATE_EMBEDDING_INDEX_SQL = "CREATE INDEX IF NOT EXISTS idx_embedding ON notes(embedding);"

# Both statements use IF NOT EXISTS, so re-running the script does not fail on
# an already-initialised database.
for ddl_statement in (CREATE_NOTES_TABLE_SQL, CREATE_EMBEDDING_INDEX_SQL):
    cur.execute(ddl_statement)

# NOTE(review): idx_embedding is an ordinary index over the embedding column;
# whether it helps the distance query further down is not evident from this
# file - confirm. The original author left this alternative for the libsql
# vector extension:
# cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding_vector ON notes(libsql_vector_idx(embedding));")
# Embedding client, configured from the settings read from the config file.
embeddings_model = LocalLMEmbeddings(
    model=EMBEDDING_MODEL,
    base_url=API_BASE,
    batch_size=EMBEDDING_BATCH_SIZE,
)

# Small hand-written sample corpus used to exercise the insert/search round trip.
texts_to_embed = [
    "The quick brown fox jumped over the lazy dog",
    "Tiffany is my wife, she writes books and watches films",
    "Mazie and Bella are my labradour dogs that are two and three years old, they are white and have a pink nose",
    "The movie Titanic is about a love story on a big boat. but the boat sinks in the end",
]

# Request embeddings for all sample texts in a single call.
# NOTE(review): _post_request is underscore-prefixed, i.e. presumably a
# private helper of LocalLMEmbeddings - confirm whether a public method
# should be used here instead.
reply = embeddings_model._post_request(texts_to_embed)

# Lazily pair each text with the corresponding entry of the reply; this
# iterator can only be consumed once.
zipped = zip(texts_to_embed, reply)
# Insert all sample chunks with one executemany() call. The statement itself
# is a single-row INSERT that is run once per parameter tuple; the embedding
# is passed in as text and wrapped in vector32() inside the SQL.
batch_insert_sql = """
INSERT INTO notes (file_path, file_name, chunk_data, embedding)
VALUES (?, ?, ?, vector32(?))
"""

# One parameter tuple per (text, embedding) pair. The file path and name are
# placeholders derived from the pair's position. str(embed) is the
# embedding's plain string form (for a Python list: "[0.1, 0.2, ...]"), not a
# bare comma-separated string.
batch_data = [
    (f"path/to/file_{number}", f"file_{number}", text, str(embed))
    for number, (text, embed) in enumerate(zipped)
]

# zip() stops at the shorter input, so a reply with fewer embeddings than
# texts would otherwise drop the remaining texts without any error.
if len(batch_data) != len(texts_to_embed):
    raise ValueError(
        f"expected {len(texts_to_embed)} embeddings, got {len(batch_data)}"
    )

cur.executemany(batch_insert_sql, batch_data)
con.commit()
# Embed the search phrase through the same client call used for the stored
# sample texts.
query_string = ["tell me about a film on a ship"]
query_reply = embeddings_model._post_request(query_string)

# Rank every stored chunk by vector_distance_cos() against the query
# embedding, smallest distance first. The embedding is bound as a parameter
# (matching the INSERT statement) instead of being spliced into the SQL text,
# so the statement does not depend on how the embedding happens to print.
search_sql = """
SELECT id,
       file_path,
       file_name,
       chunk_data,
       vector_distance_cos(embedding, vector32(?)) AS distance
FROM notes
ORDER BY distance ASC;
"""
cur.execute(search_sql, (str(query_reply[0]),))
# NOTE(review): the original left "vector_extract(embedding)" here as a
# commented hint - presumably an extra SELECT column for inspecting the
# stored vectors; confirm before adding it.

# Show the query followed by every row, best match first.
print(query_string[0])
rows = cur.fetchall()
for row in rows:
    print(row)