feat: Working PoC of the Dungeon Masters Vault

2026-01-27 21:24:18 +00:00
parent 645e9461ce
commit 4296a4df88
15 changed files with 347 additions and 563 deletions
+70 -180
@@ -1,228 +1,118 @@
# ingest.py
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import dspy
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

from embedding import LocalLMEmbeddings
from experts.ingestion_agent import IngestionAgent

CHROMA_PATH = "vector_vault"  # only referenced by the commented-out Turso/Chroma code below
DATA_DIR = "/home/cosmic/DnD"
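
# Pipeline: load .md notes -> split into overlapping chunks -> enrich each
# chunk with LLM-generated metadata (synopsis + tags) -> embed and store in FAISS.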
def load_documents():
    """
    Recursively walk through DATA_DIR and load all .md files as plain text.
    Each document gets metadata including the source filename and full path.
    Ideal for RAG embedding pipelines.
    """
    docs = []
    data_path = Path(DATA_DIR)
    if not data_path.exists() or not data_path.is_dir():
        print(f"⚠️ Data directory '{DATA_DIR}' does not exist.")
        return docs
    for file_path in data_path.rglob("*.md"):
        try:
            loader = TextLoader(str(file_path))
            loaded_docs = loader.load()
            for doc in loaded_docs:
                # Ensure these keys are set before splitting
                doc.metadata["source"] = file_path.name
                doc.metadata["full_path"] = str(file_path.absolute())
            docs.extend(loaded_docs)
            print(f"✅ Loaded: {file_path.name}")
        except Exception as e:
            print(f"❌ Failed to load {file_path}: {e}")
    print(f"📊 Total documents loaded: {len(docs)}")
    return docs
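
# Note: TextLoader reads each .md file as a single Document, and the metadata
# set above survives splitting, e.g. (illustrative values, not real output):
#   Document(page_content="# Waterdeep\n...",
#            metadata={"source": "waterdeep.md",
#                      "full_path": "/home/cosmic/DnD/lore/waterdeep.md"})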
def chunk_documents(docs):
    # LangChain preserves metadata during splitting automatically
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    return text_splitter.split_documents(docs)
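
# The splitter tries the separators in order (paragraph break, line break,
# sentence, word), so each ~800-character chunk ends at the most natural
# boundary available, with 100 characters of overlap for context continuity.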
def enrich_chunks(chunks: list) -> list:
    MODEL_BASE = "lm_studio/qwen/qwen3-8b"
    API_BASE = "http://192.168.0.49:1234/v1/"

    def process_single_chunk(indexed_chunk):
        idx, chunk = indexed_chunk
        lm_index = idx % 8
        try:
            # Configure the LM context for this specific thread
            with dspy.context(lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE)):
                # Pass the text; the original chunk object is updated in place
                response = IngestionAgent().ingest(note=chunk.page_content)
            answer = response.answer
            start = answer.find("{")
            end = answer.rfind("}") + 1
            metadata_extracted = json.loads(answer[start:end])
            # Put AI-generated data in a sub-key to avoid overwriting 'source'
            chunk.metadata["enrichment"] = metadata_extracted
            # Also flatten tags for easier searching if needed
            if "tags" in metadata_extracted:
                chunk.metadata["tags"] = metadata_extracted["tags"]
        except Exception as e:
            # If enrichment fails, KEEP the chunk but flag the error,
            # so 'source' and 'full_path' are never lost
            chunk.metadata["enrichment_error"] = str(e)
            chunk.metadata["tags"] = ["error"]
        return idx, chunk

    enriched_results = []
    # Run 8 parallel workers; each picks a model slot via idx % 8
    with ThreadPoolExecutor(max_workers=8) as executor:
        # Enumerate the chunks to keep track of their original order
        futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
        for future in tqdm(as_completed(futures), total=len(chunks), desc="Enriching chunks"):
            enriched_results.append(future.result())
    # Sort by the index (first element of the tuple) and return only the chunks
    enriched_results.sort(key=lambda x: x[0])
    return [item[1] for item in enriched_results]
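
# For reference, a minimal sketch of the interface this file assumes from
# experts/ingestion_agent.py (names are illustrative, not the real code):
#
#   class NoteMetadata(dspy.Signature):
#       """Summarize a D&D note and extract tags, answering in JSON."""
#       note = dspy.InputField()
#       answer = dspy.OutputField(desc='JSON object with "synopsis" and "tags"')
#
#   class IngestionAgent(dspy.Module):
#       def ingest(self, note: str):
#           return dspy.Predict(NoteMetadata)(note=note)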
def store_chunks_locally(chunks, db_path="./local_faiss_db"):
    # LocalLMEmbeddings (see embedding.py) implements LangChain's Embeddings
    # interface against the local LM Studio server
    embeddings_model = LocalLMEmbeddings(
        model="text-embedding-qwen3-embedding-8b",
        base_url="http://192.168.0.49:1234",
        batch_size=32,
    )
    print(f"Index creation started for {len(chunks)} chunks...")
    # FAISS.from_documents extracts metadata directly from the Document objects
    vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings_model)
    vectorstore.save_local(db_path)
    print(f"✅ Successfully stored {len(chunks)} chunks in FAISS at '{db_path}'")
    return vectorstore


# Store in Turso -- needs refactor, still written against the Chroma API:
# def store_in_turso(chunks):
#     client = turso.PersistentClient(path=CHROMA_PATH)
#     collection = client.get_or_create_collection("documents")
#     texts = [chunk.page_content for chunk in chunks]
#     ids = [f"doc_{i}" for i in range(len(chunks))]
#     metadatas = [chunk.metadata for chunk in chunks]
#     embeddings = ...  # would need an embedding call; the old embedder() was removed
#     collection.add(ids=ids, documents=texts, embeddings=embeddings, metadatas=metadatas)
#     print(f"✅ Successfully stored {len(chunks)} chunks.")
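
# To query the saved index later (a sketch; assumes the same embedding model):
#   vs = FAISS.load_local("./local_faiss_db", embeddings_model,
#                         allow_dangerous_deserialization=True)
#   hits = vs.similarity_search("Who rules Waterdeep?", k=4)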
def main():
    print("🔍 Loading documents...")
    docs = load_documents()
    if not docs:
        return
    print(f"📄 Loaded {len(docs)} documents. Splitting into chunks...")
    chunks = chunk_documents(docs)
    print(f"🧩 Created {len(chunks)} chunks.")
    print("🧠 Generating summaries and tags using the local LLM... (this may take a few minutes)")
    enriched_chunks = enrich_chunks(chunks)
    print("💾 Storing in vector database...")
    store_chunks_locally(enriched_chunks)
    print("🎉 Ingestion complete!")


if __name__ == "__main__":
    main()