pull

2026-03-04 09:11:53 +00:00
parent e24c0cdf33
commit bbaebf1f70
2 changed files with 5 additions and 84 deletions
@@ -18,4 +18,8 @@ common model attributes - temp & top-k
 QA specific embedding models? 
 Evaluation metrics, how good is it doing? 
-    rate my response!? 
+    rate my response!? 
 examples into prompts & better prompts
 common model attributes - temp & top-k 
@@ -1,83 +0,0 @@
 import turso
 from config_loader import load_config
 from embedding import LocalLMEmbeddings
 # Load the project configuration once; only the three keys read below are
 # used by this script.
 CFG = load_config()
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
 EMBEDDING_BATCH_SIZE=CFG["ingestion"]["embedding_batch_size"]
 # Connect to the "dmv.db" database and keep a single cursor that the rest of
 # the script reuses for every statement.
 con = turso.connect("dmv.db")
 cur = con.cursor()
 # Create the notes table if it does not exist yet: one row per text chunk,
 # with its source path/name, the chunk text and its embedding.
 # NOTE(review): the embedding column is declared with a fixed width of 4096;
 # presumably this has to match the output dimension of EMBEDDING_MODEL --
 # confirm before changing the configured model.
 cur.execute("""
 CREATE TABLE IF NOT EXISTS notes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file_path TEXT NOT NULL,
    file_name TEXT NOT NULL,
    chunk_data TEXT,
    embedding F32_BLOB(4096),
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
  )""")
 # NOTE(review): this is a plain index over the raw embedding column; whether
 # it helps vector search at all is unclear -- the vector-index alternative is
 # kept below for comparison. Confirm which one is intended.
 cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON notes(embedding);")
 # OR, if using libsql vector extension:
 # cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding_vector ON notes(libsql_vector_idx(embedding));")
 # Embedding client configured from the values loaded above; it is used later
 # for both the stored texts and the search query.
 embeddings_model = LocalLMEmbeddings(
    model=EMBEDDING_MODEL,
    base_url=API_BASE,
    batch_size=EMBEDDING_BATCH_SIZE,
 )
 # Sample passages used to exercise the embed -> store -> search round trip.
 texts_to_embed = [
     "The quick brown fox jumped over the lazy dog",
     "Tiffany is my wife, she writes books and watches films",
     "Mazie and Bella are my labradour dogs that are two and three years old, they are white and have a pink nose",
     "The movie Titanic is about a love story on a big boat. but the boat sinks in the end",
 ]
 # One request covers the whole list; the reply is walked in step with the
 # inputs below.
 # NOTE(review): _post_request is a private helper of LocalLMEmbeddings --
 # confirm whether a public method should be called instead.
 reply = embeddings_model._post_request(texts_to_embed)
 # Parameterised INSERT executed once per row tuple by executemany().
 batch_insert_sql = """
 INSERT INTO notes (file_path, file_name, chunk_data, embedding)
 VALUES (?, ?, ?, vector32(?))
 """
 # One tuple per text: placeholder path, placeholder file name, the text
 # itself, and the embedding rendered with str() so it is bound as text for
 # vector32().
 batch_data = [
     (f"path/to/file_{number}", f"file_{number}", text, str(embed))
     for number, (text, embed) in enumerate(zip(texts_to_embed, reply))
 ]
 cur.executemany(batch_insert_sql, batch_data)
 con.commit()
 # Embed the search phrase with the same client that produced the stored
 # embeddings, then list every stored chunk ordered by cosine distance to it
 # (smallest distance first).
 query_string = ["tell me about a film on a ship"]
 query_reply = embeddings_model._post_request(query_string)
 # The query embedding is bound as a parameter -- the same way the INSERT
 # passes embeddings to vector32(?) -- instead of being spliced into the SQL
 # text with an f-string. str() yields exactly the text the f-string produced.
 cur.execute(
     """
 SELECT id,
        file_path,
        file_name,
        chunk_data,
        vector_distance_cos(embedding, vector32(?)) AS distance
 FROM notes
 ORDER BY distance ASC;
 """,
     (str(query_reply[0]),),
 )
 # Earlier debugging note kept from the original: vector_extract(embedding)
 print(query_string[0])
 rows = cur.fetchall()
 for row in rows:
     print(row)