diff --git a/TODO b/TODO index 5133f06..de347e3 100644 --- a/TODO +++ b/TODO @@ -18,4 +18,8 @@ common model attributes - temp & top-k QA specific embedding models? Evaluation metrics, how good is it doing? - rate my response!? \ No newline at end of file + rate my response!? +examples into prompts & better prompts + +common model attributes - temp & top-k + diff --git a/src/testing_turso.py b/src/testing_turso.py deleted file mode 100644 index 0b2cdc3..0000000 --- a/src/testing_turso.py +++ /dev/null @@ -1,83 +0,0 @@ -import turso - -from config_loader import load_config -from embedding import LocalLMEmbeddings - -CFG = load_config() -EMBEDDING_MODEL = CFG["models"]["embedding"] -API_BASE = CFG["api"]["base_url"] -EMBEDDING_BATCH_SIZE=CFG["ingestion"]["embedding_batch_size"] - -con = turso.connect("dmv.db") -cur = con.cursor() -cur.execute(""" -CREATE TABLE IF NOT EXISTS notes ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_path TEXT NOT NULL, - file_name TEXT NOT NULL, - chunk_data TEXT, - embedding F32_BLOB(4096), - timestamp DATETIME DEFAULT CURRENT_TIMESTAMP - )""") - -cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding ON notes(embedding);") -# OR, if using libsql vector extension: -# cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding_vector ON notes(libsql_vector_idx(embedding));") - -embeddings_model = LocalLMEmbeddings( - model=EMBEDDING_MODEL, - base_url=API_BASE, - batch_size=EMBEDDING_BATCH_SIZE, -) - -texts_to_embed = [ -"The quick brown fox jumped over the lazy dog", -"Tiffany is my wife, she writes books and watches films", -"Mazie and Bella are my labradour dogs that are two and three years old, they are white and have a pink nose", -"The movie Titanic is about a love story on a big boat. but the boat sinks in the end" -] - -reply = embeddings_model._post_request(texts_to_embed) -zipped = zip(texts_to_embed,reply) - - -# Instead of looping and executing one INSERT at a time -# Batch insert using multiple VALUES -batch_insert_sql = """ -INSERT INTO notes (file_path, file_name, chunk_data, embedding) -VALUES (?, ?, ?, vector32(?)) -""" - -# Prepare batch data -batch_data = [] -for number, (text, embed) in enumerate(zipped): - batch_data.append(( - f"path/to/file_{number}", - f"file_{number}", - text, - str(embed) # format as comma-separated string - )) - -cur.executemany(batch_insert_sql, batch_data) -con.commit() - -query_string = ["tell me about a film on a ship"] -query_reply = embeddings_model._post_request(query_string) - - -cur.execute(f""" -SELECT id, - file_path, - file_name, - chunk_data, - vector_distance_cos(embedding, vector32('{query_reply[0]}')) AS distance -FROM notes -ORDER BY distance ASC; -""") - # vector_extract(embedding) - -print(query_string[0]) - -rows = cur.fetchall() -for row in rows: - print(row) \ No newline at end of file