feat: Refactor to Turso!

2026-02-06 22:18:20 +00:00
parent cabf4f5eab
commit 02698472ce
10 changed files with 769 additions and 19 deletions
+198 -11
@@ -1,6 +1,7 @@
import turso
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from datetime import datetime
import dspy
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
@@ -22,8 +23,9 @@ MAX_WORKERS=CFG["ingestion"]["max_workers"]
CHUNK_SIZE=CFG["ingestion"]["chunk_size"]
CHUNK_OVERLAP=CFG["ingestion"]["chunk_overlap"]
EMBEDDING_BATCH_SIZE=CFG["ingestion"]["embedding_batch_size"]
TIMEFILE = CFG["ingestion"]["time_file_location"]
def load_documents(last_update_time):
docs = []
data_path = Path(DATA_DIR)
@@ -32,6 +34,10 @@ def load_documents():
return docs
for file_path in data_path.rglob("*.md"):
file_modified_date = datetime.fromtimestamp(file_path.stat().st_mtime)
if file_modified_date < last_update_time:
continue
try:
loader = TextLoader(str(file_path))
loaded_docs = loader.load()
@@ -90,31 +96,212 @@ def enrich_chunks(chunks: list) -> list:
return [item[1] for item in enriched_results]
def embed_chunks(chunks):
"""
Embed chunks and return a list of dictionaries with full metadata.
Each dict contains:
- file_path
- file_name
- chunk_data
- synopsis
- tags
- entities
- embedding
"""
embeddings_model = LocalLMEmbeddings(
model=EMBEDDING_MODEL,
base_url=API_BASE,
batch_size=EMBEDDING_BATCH_SIZE,
)
print(f"Index creation started for {len(chunks)} chunks...")
# FAISS.from_documents extracts metadata directly from the Document objects
vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings_model)
vectorstore.save_local(db_path)
print(f"✅ Successfully stored in FAISS at '{db_path}'")
return vectorstore
# Prepare list of dictionaries for output
embedded_chunks = []
for i, chunk in enumerate(tqdm(chunks, desc="Embedding chunks")):
try:
# Extract metadata
file_path = chunk.metadata.get("full_path", "unknown")
file_name = chunk.metadata.get("source", "unknown")
content = chunk.page_content
# Extract enriched metadata (from IngestionAgent)
synopsis = chunk.metadata.get("synopsis", "No summary")
tags = chunk.metadata.get("tags", [])
entities = chunk.metadata.get("entities", [])
# Generate embedding
embedding = embeddings_model.embed_query(content)
# Create structured dictionary
chunk_data = {
"file_path": file_path,
"file_name": file_name,
"chunk_data": content,
"synopsis": synopsis,
"tags": tags,
"entities": entities,
"embedding": embedding,
"timestamp": datetime.now().isoformat()
}
embedded_chunks.append(chunk_data)
except Exception as e:
print(f"⚠️ Failed to embed chunk {i}: {e}")
# Fallback entry
embedded_chunks.append({
"file_path": chunk.metadata.get("full_path", "unknown"),
"file_name": chunk.metadata.get("source", "unknown"),
"chunk_data": content,
"synopsis": "Embedding failed",
"tags": ["error"],
"entities": [],
"embedding": [],
"timestamp": datetime.now().isoformat()
})
return embedded_chunks
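
# A possible batching optimisation (a sketch, not part of this commit).
# LangChain's Embeddings interface exposes embed_documents() for batched
# calls, so the per-chunk embed_query() loop above could become a single
# batched pass. Assumes the project's LocalLMEmbeddings implements that
# interface; the helper name is hypothetical.
def embed_chunks_batched(chunks):
    embeddings_model = LocalLMEmbeddings(
        model=EMBEDDING_MODEL,
        base_url=API_BASE,
        batch_size=EMBEDDING_BATCH_SIZE,
    )
    texts = [chunk.page_content for chunk in chunks]
    # One request per EMBEDDING_BATCH_SIZE texts instead of one per chunk
    vectors = embeddings_model.embed_documents(texts)
    embedded = []
    for chunk, vector in zip(chunks, vectors):
        embedded.append({
            "file_path": chunk.metadata.get("full_path", "unknown"),
            "file_name": chunk.metadata.get("source", "unknown"),
            "chunk_data": chunk.page_content,
            "synopsis": chunk.metadata.get("synopsis", "No summary"),
            "tags": chunk.metadata.get("tags", []),
            "entities": chunk.metadata.get("entities", []),
            "embedding": vector,
            "timestamp": datetime.now().isoformat(),
        })
    return embedded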
def save_to_db(chunk_dicts):
"""
Save a list of dictionaries to the Turso database.
Each dict maps to a row in the 'notes' table.
"""
print('connecting to db')
con = turso.connect(DATABASE_PATH)
print('opening cursor')
cur = con.cursor()
    # Positional placeholders keep the values safely parameterised
insert_sql = """
INSERT INTO notes (
file_path, file_name, chunk_data, synopsis, tags, entities, embedding, timestamp
) VALUES (?, ?, ?, ?, ?, ?, vector32(?), ?)
"""
# Prepare batch data: convert each dict to a tuple in correct order
batch_data = []
for entry in chunk_dicts:
        # Serialize the embedding as a bracketed string, e.g. "[0.1, 0.2, ...]",
        # which is the format vector32() expects
        embedding_str = str(entry["embedding"])
batch_data.append((
entry["file_path"],
entry["file_name"],
entry["chunk_data"],
entry["synopsis"],
",".join(entry["tags"]), # Store as comma-separated string
",".join(entry["entities"]), # Store as comma-separated string
embedding_str,
entry["timestamp"]
))
print('data to insert:',len(batch_data))
# Execute batch insert
cur.executemany(insert_sql, batch_data)
con.commit()
con.close()
print(f"✅ Saved {len(batch_data)} chunks to database.")
def create_db():
con = turso.connect(DATABASE_PATH)
cur = con.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS notes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
file_name TEXT NOT NULL,
chunk_data TEXT NOT NULL,
synopsis TEXT,
tags TEXT, -- comma-separated
entities TEXT, -- comma-separated
embedding F32_BLOB(4096),
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
UNIQUE(file_path, chunk_data) -- avoid duplicates
)
""")
    # Indexes for faster metadata queries. A plain B-tree index on the
    # embedding blob would not accelerate vector similarity search, so only
    # the text columns are indexed here; vector search needs a dedicated
    # vector index instead.
cur.execute("CREATE INDEX IF NOT EXISTS idx_file_path ON notes(file_path);")
cur.execute("CREATE INDEX IF NOT EXISTS idx_tags ON notes(tags);")
cur.execute("CREATE INDEX IF NOT EXISTS idx_synopsis ON notes(synopsis);")
con.commit()
con.close()
print("✅ Database and indexes created.")
def get_last_update_time():
try:
with open(TIMEFILE, "r") as file:
last_update_str = file.read()
last_update = datetime.strptime(last_update_str,"%Y/%m/%d - %H:%M:%S")
    except (FileNotFoundError, ValueError):
        print("Timefile missing or unreadable; ingesting all files")
last_update = datetime(year=2000,month=1,day=1)
return last_update
def update_timefile():
current_time = datetime.now()
current_time_str = current_time.strftime("%Y/%m/%d - %H:%M:%S")
with open(TIMEFILE, "w") as file:
file.write(current_time_str)
return current_time_str
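
# Note: the "%Y/%m/%d - %H:%M:%S" format truncates sub-second precision, so
# a file modified in the same second the timefile was written gets
# re-ingested on the next run, which is one way to hit the UNIQUE constraint
# failure quoted below. A lossless sketch (hypothetical names) that stores
# the raw epoch float instead:
def update_timefile_epoch():
    now = datetime.now().timestamp()
    with open(TIMEFILE, "w") as file:
        file.write(repr(now))
    return now

def get_last_update_time_epoch():
    try:
        with open(TIMEFILE, "r") as file:
            return datetime.fromtimestamp(float(file.read()))
    except (FileNotFoundError, ValueError):
        return datetime(year=2000, month=1, day=1)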
def main():
create_db()
last_update_time = get_last_update_time()
print(f"Last update time: {last_update_time}")
docs = load_documents(last_update_time)
if not docs:
print("No Recently Updated Files to Ingest")
return
chunks = chunk_documents(docs)
print(f"Split into {len(chunks)} chunks.")
enriched_chunks = enrich_chunks(chunks)
print(f"Enriched {len(enriched_chunks)} chunks.")
embedded_chunks = embed_chunks(enriched_chunks)
print(f"Embedded {len(embedded_chunks)} chunks.")
save_to_db(embedded_chunks)
print("🎉 Ingestion complete!")
updated = update_timefile()
print(f"Updated timefile to: {updated}")
if __name__ == "__main__":
main()
#TODO: create a function to delete rows that match new files coming in
#      and handle database insert failures (see traceback below)
"""
Traceback (most recent call last):
File "/home/cosmic/source/dungeon_masters_vault/.venv/lib/python3.13/site-packages/turso/lib.py", line 643, in executemany
result = _run_execute_with_io(stmt, self._connection.extra_io)
File "/home/cosmic/source/dungeon_masters_vault/.venv/lib/python3.13/site-packages/turso/lib.py", line 163, in _run_execute_with_io
result = stmt.execute()
turso.Constraint: UNIQUE constraint failed: notes.(file_path, chunk_data) (19)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/cosmic/source/dungeon_masters_vault/src/ingest.py", line 280, in <module>
main()
~~~~^^
File "/home/cosmic/source/dungeon_masters_vault/src/ingest.py", line 273, in main
save_to_db(embedded_chunks)
~~~~~~~~~~^^^^^^^^^^^^^^^^^
File "/home/cosmic/source/dungeon_masters_vault/src/ingest.py", line 201, in save_to_db
cur.executemany(insert_sql, batch_data)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/cosmic/source/dungeon_masters_vault/.venv/lib/python3.13/site-packages/turso/lib.py", line 656, in executemany
raise _map_turso_exception(exc)
turso.lib.IntegrityError: UNIQUE constraint failed: notes.(file_path, chunk_data) (19)
"""