feat: Working PoC of the Dungeon Masters Vault

2026-01-27 21:24:18 +00:00
parent 645e9461ce
commit 4296a4df88
15 changed files with 347 additions and 563 deletions
+70 -180
@@ -1,228 +1,118 @@
# ingest.py
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import dspy
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

from embedding import LocalLMEmbeddings
from experts.ingestion_agent import IngestionAgent

CHROMA_PATH = "vector_vault"  # only referenced by the commented-out Turso/Chroma code below
DATA_DIR = "/home/cosmic/DnD"
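
# Pipeline: load .md notes -> split into overlapping chunks -> enrich each
# chunk with LLM-generated metadata (synopsis + tags) -> embed and store in FAISS.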
def load_documents():
    """
    Recursively walk through DATA_DIR and load all .md files as plain text.
    Each document gets metadata including the source filename and full path.
    Ideal for RAG embedding pipelines.
    """
    docs = []
    data_path = Path(DATA_DIR)
    if not data_path.exists() or not data_path.is_dir():
        print(f"⚠️ Data directory '{DATA_DIR}' does not exist.")
        return docs
    for file_path in data_path.rglob("*.md"):
        try:
            loader = TextLoader(str(file_path))
            loaded_docs = loader.load()
            for doc in loaded_docs:
                # Ensure these keys are set before splitting
                doc.metadata["source"] = file_path.name
                doc.metadata["full_path"] = str(file_path.absolute())
            docs.extend(loaded_docs)
            print(f"✅ Loaded: {file_path.name}")
        except Exception as e:
            print(f"❌ Failed to load {file_path}: {e}")
    print(f"📊 Total documents loaded: {len(docs)}")
    return docs
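
# Note: TextLoader reads each .md file as a single Document, and the metadata
# set above survives splitting, e.g. (illustrative values, not real output):
#   Document(page_content="# Waterdeep\n...",
#            metadata={"source": "waterdeep.md",
#                      "full_path": "/home/cosmic/DnD/lore/waterdeep.md"})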
def chunk_documents(docs):
    # LangChain preserves metadata during splitting automatically
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    return text_splitter.split_documents(docs)
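
# The splitter tries the separators in order (paragraph break, line break,
# sentence, word), so each ~800-character chunk ends at the most natural
# boundary available, with 100 characters of overlap for context continuity.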
def enrich_chunks(chunks: list) -> list:
    MODEL_BASE = "lm_studio/qwen/qwen3-8b"
    API_BASE = "http://192.168.0.49:1234/v1/"

    def process_single_chunk(indexed_chunk):
        idx, chunk = indexed_chunk
        lm_index = idx % 8
        try:
            # Configure the LM context for this specific thread
            with dspy.context(lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE)):
                # Pass the text; the original chunk object is updated in place
                response = IngestionAgent().ingest(note=chunk.page_content)
            answer = response.answer
            start = answer.find("{")
            end = answer.rfind("}") + 1
            metadata_extracted = json.loads(answer[start:end])
            # Put AI-generated data in a sub-key to avoid overwriting 'source'
            chunk.metadata["enrichment"] = metadata_extracted
            # Also flatten tags for easier searching if needed
            if "tags" in metadata_extracted:
                chunk.metadata["tags"] = metadata_extracted["tags"]
        except Exception as e:
            # If enrichment fails, KEEP the chunk but flag the error,
            # so 'source' and 'full_path' are never lost
            chunk.metadata["enrichment_error"] = str(e)
            chunk.metadata["tags"] = ["error"]
        return idx, chunk

    enriched_results = []
    # Run 8 parallel workers; each picks a model slot via idx % 8
    with ThreadPoolExecutor(max_workers=8) as executor:
        # Enumerate the chunks to keep track of their original order
        futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
        for future in tqdm(as_completed(futures), total=len(chunks), desc="Enriching chunks"):
            enriched_results.append(future.result())
    # Sort by the index (first element of the tuple) and return only the chunks
    enriched_results.sort(key=lambda x: x[0])
    return [item[1] for item in enriched_results]
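
# For reference, a minimal sketch of the interface this file assumes from
# experts/ingestion_agent.py (names are illustrative, not the real code):
#
#   class NoteMetadata(dspy.Signature):
#       """Summarize a D&D note and extract tags, answering in JSON."""
#       note = dspy.InputField()
#       answer = dspy.OutputField(desc='JSON object with "synopsis" and "tags"')
#
#   class IngestionAgent(dspy.Module):
#       def ingest(self, note: str):
#           return dspy.Predict(NoteMetadata)(note=note)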
def store_chunks_locally(chunks, db_path="./local_faiss_db"):
    # LocalLMEmbeddings (see embedding.py) implements LangChain's Embeddings
    # interface against the local LM Studio server
    embeddings_model = LocalLMEmbeddings(
        model="text-embedding-qwen3-embedding-8b",
        base_url="http://192.168.0.49:1234",
        batch_size=32,
    )
    print(f"Index creation started for {len(chunks)} chunks...")
    # FAISS.from_documents extracts metadata directly from the Document objects
    vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings_model)
    vectorstore.save_local(db_path)
    print(f"✅ Successfully stored {len(chunks)} chunks in FAISS at '{db_path}'")
    return vectorstore


# Store in Turso -- needs refactor, still written against the Chroma API:
# def store_in_turso(chunks):
#     client = turso.PersistentClient(path=CHROMA_PATH)
#     collection = client.get_or_create_collection("documents")
#     texts = [chunk.page_content for chunk in chunks]
#     ids = [f"doc_{i}" for i in range(len(chunks))]
#     metadatas = [chunk.metadata for chunk in chunks]
#     embeddings = ...  # would need an embedding call; the old embedder() was removed
#     collection.add(ids=ids, documents=texts, embeddings=embeddings, metadatas=metadatas)
#     print(f"✅ Successfully stored {len(chunks)} chunks.")
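
# To query the saved index later (a sketch; assumes the same embedding model):
#   vs = FAISS.load_local("./local_faiss_db", embeddings_model,
#                         allow_dangerous_deserialization=True)
#   hits = vs.similarity_search("Who rules Waterdeep?", k=4)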
def main():
    print("🔍 Loading documents...")
    docs = load_documents()
    if not docs:
        return
    print(f"📄 Loaded {len(docs)} documents. Splitting into chunks...")
    chunks = chunk_documents(docs)
    print(f"🧩 Created {len(chunks)} chunks.")
    print("🧠 Generating summaries and tags using the local LLM... (this may take a few minutes)")
    enriched_chunks = enrich_chunks(chunks)
    print("💾 Storing in vector database...")
    store_chunks_locally(enriched_chunks)
    print("🎉 Ingestion complete!")


if __name__ == "__main__":
    main()