pull
This commit is contained in:
@@ -18,4 +18,8 @@ common model attributes - temp & top-k
|
|||||||
QA specific embedding models?
|
QA specific embedding models?
|
||||||
|
|
||||||
Evaluation metrics: how well is it doing?
|
Evaluation metrics: how well is it doing?
|
||||||
rate my response!?
|
rate my response!?
|
||||||
|
examples into prompts & better prompts
|
||||||
|
|
||||||
|
common model attributes - temp & top-k
|
||||||
|
|
||||||
|
|||||||
@@ -1,83 +0,0 @@
|
|||||||
import turso
|
|
||||||
|
|
||||||
from config_loader import load_config
|
|
||||||
from embedding import LocalLMEmbeddings
|
|
||||||
|
|
||||||
# Load the project configuration once, then pull out the three settings this
# script needs: the embedding model name, the API base URL and the batch size
# used when requesting embeddings.
CFG = load_config()

EMBEDDING_MODEL, API_BASE, EMBEDDING_BATCH_SIZE = (
    CFG["models"]["embedding"],
    CFG["api"]["base_url"],
    CFG["ingestion"]["embedding_batch_size"],
)
|
|
||||||
|
|
||||||
# Connect to the dmv.db database and keep one cursor for every statement
# issued by this script.
con = turso.connect("dmv.db")
cur = con.cursor()

# One row per text chunk: where it came from, the chunk text, its embedding
# and an insertion timestamp.
# NOTE(review): F32_BLOB(4096) presumably has to match the output dimension of
# the configured embedding model - confirm against the model in use.
create_notes_table_sql = """
CREATE TABLE IF NOT EXISTS notes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file_path TEXT NOT NULL,
    file_name TEXT NOT NULL,
    chunk_data TEXT,
    embedding F32_BLOB(4096),
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
)"""
cur.execute(create_notes_table_sql)

# NOTE(review): this is an ordinary index over the raw embedding column; it is
# unlikely to speed up similarity search - verify whether it is needed at all.
create_embedding_index_sql = (
    "CREATE INDEX IF NOT EXISTS idx_embedding ON notes(embedding);"
)
cur.execute(create_embedding_index_sql)
# OR, if using libsql vector extension:
# cur.execute("CREATE INDEX IF NOT EXISTS idx_embedding_vector ON notes(libsql_vector_idx(embedding));")
|
|
||||||
|
|
||||||
# Embedding client built from the configured model name, API base URL and
# batch size.
embeddings_model = LocalLMEmbeddings(
    model=EMBEDDING_MODEL, base_url=API_BASE, batch_size=EMBEDDING_BATCH_SIZE
)

# Small hand-written corpus used to exercise the insert and search path.
texts_to_embed = [
    "The quick brown fox jumped over the lazy dog",
    "Tiffany is my wife, she writes books and watches films",
    "Mazie and Bella are my labradour dogs that are two and three years old, they are white and have a pink nose",
    "The movie Titanic is about a love story on a big boat. but the boat sinks in the end",
]

# NOTE(review): _post_request is a private method of LocalLMEmbeddings; it is
# presumed to return one embedding per input text, in order - confirm, and
# prefer a public method if the class offers one.
reply = embeddings_model._post_request(texts_to_embed)

# zip() returns a one-shot iterator, so `zipped` can be consumed only once.
zipped = zip(texts_to_embed, reply)
|
|
||||||
|
|
||||||
|
|
||||||
# Single-row parameterised INSERT. executemany() below runs it once for each
# tuple in batch_data instead of the script issuing separate execute() calls.
# The embedding is bound as text and wrapped in vector32(?) on the SQL side.
batch_insert_sql = """
INSERT INTO notes (file_path, file_name, chunk_data, embedding)
VALUES (?, ?, ?, vector32(?))
"""

# One parameter tuple per (text, embedding) pair. Paths and file names are
# placeholders derived from the pair's position. str(embed) is Python's own
# textual form of the embedding (for a list: "[0.1, 0.2, ...]").
batch_data = [
    (f"path/to/file_{number}", f"file_{number}", text, str(embed))
    for number, (text, embed) in enumerate(zipped)
]

cur.executemany(batch_insert_sql, batch_data)
con.commit()
|
|
||||||
|
|
||||||
# Embed a single natural-language query; the request takes a list of texts,
# so the query is wrapped in a one-element list.
query_string = ["tell me about a film on a ship"]
query_reply = embeddings_model._post_request(query_string)

# Rank every stored chunk by cosine distance to the query embedding, closest
# first. The embedding is passed as a bound parameter (the same vector32(?)
# form the INSERT uses) rather than being interpolated into the SQL text with
# an f-string, so the statement text stays constant and no value is ever
# spliced into the SQL itself.
# To inspect stored vectors while debugging, vector_extract(embedding) can be
# added to the SELECT list.
similarity_sql = """
SELECT id,
       file_path,
       file_name,
       chunk_data,
       vector_distance_cos(embedding, vector32(?)) AS distance
FROM notes
ORDER BY distance ASC;
"""
cur.execute(similarity_sql, (str(query_reply[0]),))

print(query_string[0])

# Each row is (id, file_path, file_name, chunk_data, distance).
rows = cur.fetchall()
for row in rows:
    print(row)
|
|
||||||
Reference in New Issue
Block a user