3 Commits

Author       SHA1        Message                                                     Date
Jake         1e20a5452f  fix: 🐛 more stable ingestion                               2026-03-08 17:28:29 +00:00
Jake-Pullen  90c88b068b  Merge pull request #1 from Jake-Pullen/ai_in_the_middle     2026-03-07 11:22:11 +00:00
                         (feat: AI Powered enhanced queries to get better results)
Jake         26c0049fd8  feat: AI Powered enhanced queries to get better results     2026-03-07 11:08:21 +00:00
6 changed files with 108 additions and 51 deletions
+7 -1
View File
@@ -11,9 +11,15 @@
 ## Planned Next
-* AI in the middle - make the llm generate multiple queries for a wider search
 * database retrieve for tag or entity

 ## Planned Later
 * entity chunking & re-ranking
+* Logging in Ingestion
+* More robust ingestion - LLM responses sometimes fall outside the expected format
+
+## Done
+* AI in the middle - make the llm generate multiple queries for a wider search
+36 -13
View File
@@ -8,13 +8,15 @@ models:
   enrich: "lm_studio/qwen-" # will have an identifier, based on amount of active LLMs see ./load_ingestion_llms.sh
   embedding: "text-embedding-qwen3-embedding-8b"
   retrieval: "lm_studio/qwen/qwen3-30b-a3b-2507"
+  expansion: "lm_studio/qwen/qwen3-30b-a3b-2507"

 # --- Ingestion Settings ---
 ingestion:
-  data_dir: "/home/cosmic/DnD"
-  db_path: "./data/dmv.db"
+  data_dir: "/home/jake/DnD"
+  db_path: "./data/"
+  db_name: "dmv.db"
   active_llms: 2
-  parallel_requests_per_llm: 2
+  parallel_requests_per_llm: 4
   chunk_size: 800
   chunk_overlap: 100
   embedding_batch_size: 32
@@ -23,16 +25,37 @@ ingestion:
 # ---- Agent Settings ----
 ingestion_agent:
   ingestion_signature: |
-    You are an expert Dungeon Master's assistant.
-    Analyze the provided notes and extract a concise synopsis and relevant metadata.
-    synopsis = A one-sentence summary of the document.
-    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
-    entities = a list of Key names of people, places, or factions.
-    "note -> synopsis:str, tags: list[str], entities: list[str]"
+    You are an expert Dungeon Master's assistant specialized in campaign note enrichment.
+    Your task is to analyze DnD session notes and extract structured metadata.
+    Follow these guidelines:
+    - SYNOPSIS: One concise sentence capturing the key event or development (use active voice)
+    - TAGS: Extract 3-7 relevant tags from: Campaign arcs, NPC names, Locations, Items, Spells, Factions, Plot hooks, Themes
+    - ENTITIES: List all proper nouns (NPCs, locations, organizations) - be specific and consistent with naming
+    The TAGS and ENTITIES must be a list of strings, not JSON objects
+    Format output as JSON with keys: synopsis, tags, entities

 retrieval_agent:
   retrieval_signature: |
-    You are an expert Dungeon Master's assistant.
-    Given the context and the question, answer the question.
-    Do not make things up, base all of your answers on the context.
-    Always site the file location of your source of information.
+    You are an expert Dungeon Master's assistant helping to run a campaign.
+    When answering questions about your DnD world:
+    1. Strictly use ONLY the provided context from campaign notes
+    2. If information is incomplete, infer plausibly based on established lore (flag inferences)
+    3. Always cite sources: "Per [filename], [quote/summary]"
+    4. Maintain character voice and narrative style when appropriate
+    5. For rules questions, distinguish between rules-as-written and DM interpretation
+    Provide comprehensive answers that help you run the game, including relevant details about NPCs, locations, or plot points.
+
+expansion_agent:
+  expansion_signature: |
+    You are a query expansion expert specialized in Dungeons & Dragons campaign management.
+    Given a user question about their DnD world, generate 3-5 enhanced search queries that:
+    - Cover different aspects (characters, locations, lore, rules)
+    - Include synonyms and related terms (e.g., "dragon" → "wyrm", "scales" → "armor")
+    - Address potential follow-up questions the DM might have
+    - Vary specificity (broad to narrow)
+    Return ONLY a JSON array with key "queries". Keep queries concise (5-10 words each).
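The three `*_signature` blocks above are free-text prompt instructions rather than structured config. A minimal sketch of how such a string can be attached to a DSPy predictor, assuming DSPy 2.x (where `dspy.Signature` accepts an instructions string as its second argument); the repo's actual wiring appears in the retrieval diff below:

```python
import dspy
from config_loader import load_config  # module shown in this diff

CFG = load_config()

# Fixed field spec, with instructions pulled from the expansion_signature
# block in the YAML above.
ExpansionSig = dspy.Signature(
    "question -> queries: list[str]",
    CFG["expansion_agent"]["expansion_signature"],
)
expander = dspy.Predict(ExpansionSig)
# expander(question="Who rules Barovia?").queries -> 3-5 search strings
```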
+2 -1
View File
@@ -1,5 +1,6 @@
 import requests
+from langchain_core.embeddings import Embeddings

 from config_loader import load_config

 CFG = load_config()

@@ -37,7 +38,7 @@ class LocalLMEmbeddings(Embeddings):
         for i in range(0, len(texts), self.batch_size):
             batch = texts[i : i + self.batch_size]
-            print(f"🚀 Processing batch {(i // self.batch_size) + 1} (Size: {len(batch)})...")
+            # print(f"🚀 Processing batch {(i // self.batch_size) + 1} (Size: {len(batch)})...")
             batch_vectors = self._post_request(batch)
             all_embeddings.extend(batch_vectors)
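For context, `embed_documents` chunks its input and makes one request per batch; the change above only silences the per-batch logging. A hedged sketch of that batching pattern, assuming `_post_request` targets an OpenAI-compatible `/v1/embeddings` endpoint (the endpoint path and payload shape are assumptions, not shown in this diff):

```python
from typing import List
import requests

def embed_in_batches(texts: List[str], base_url: str, model: str,
                     batch_size: int = 32) -> List[List[float]]:
    all_embeddings: List[List[float]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        # One HTTP round trip per batch; payload shape assumed OpenAI-compatible.
        resp = requests.post(
            f"{base_url}/v1/embeddings",
            json={"model": model, "input": batch},
            timeout=120,
        )
        resp.raise_for_status()
        # One vector per input text, in order.
        all_embeddings.extend(d["embedding"] for d in resp.json()["data"])
    return all_embeddings
```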
View File
+50 -25
View File
@@ -1,7 +1,7 @@
 import os
-import turso
-import dspy
+import dspy
+import turso

 from config_loader import load_config
 from embedding import LocalLMEmbeddings
@@ -9,27 +9,28 @@ from embedding import LocalLMEmbeddings
 CFG = load_config()
 DATABASE_PATH = CFG["ingestion"]["db_path"]
+DATABASE_NAME = CFG["ingestion"]["db_name"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
 RETRIEVAL_CONFIG = CFG["retrieval_agent"]
+EXPANSION_CONFIG = CFG["expansion_agent"]


 def retrieve_from_turso(embedded_question, k=5):
     query = f"""
     SELECT file_path, synopsis, tags, entities, chunk_data,
-        vector_distance_cos(embedding, vector32('{embedded_question[0]}')) AS distance
+        vector_distance_cos(embedding, vector32('{embedded_question}')) AS distance
     FROM notes
     ORDER BY distance ASC
     LIMIT {k};
     """
-    con = turso.connect(DATABASE_PATH)
+    con = turso.connect(DATABASE_PATH + DATABASE_NAME)
     cur = con.cursor()
     cur.execute(query)
     rows = cur.fetchall()
     return rows


 # --- DSPy Signature ---
 class DnDContextQA(dspy.Signature):
     f"{RETRIEVAL_CONFIG['retrieval_signature']}"
@@ -38,47 +39,71 @@ class DnDContextQA(dspy.Signature):
     answer = dspy.OutputField(desc="A detailed answer based on the notes, citing the source file.")


+class ExpansionSignature(dspy.Signature):
+    f"{EXPANSION_CONFIG['expansion_signature']}"
+
+    question = dspy.InputField()
+    answer = dspy.OutputField(
+        desc="A list of questions that will be used to vector search the database."
+    )
+
+
 class DnDRAG(dspy.Module):
     def __init__(self):
         super().__init__()
         self.embeddings_model = LocalLMEmbeddings(
             model=EMBEDDING_MODEL,
             base_url=API_BASE,
-            batch_size=1, # we only send 1 question at a time.
+            # batch_size=1,
         )
         # Tools exposed to the ReAct loop
         self.retrieval_lm = dspy.LM(
             model=CFG["models"]["retrieval"], api_base=API_BASE + CFG["api"]["api_version"]
         )
+        with dspy.context(lm=self.retrieval_lm, signature=ExpansionSignature):
+            self.query_expander = dspy.Predict("question -> queries:list[str]")
         self.tools = [self.load_file]
         self.generate_answer = dspy.ReAct(signature=DnDContextQA, tools=self.tools)

     def forward(self, question):
-        # TODO: Add step here to LLM Expand
-        # given the current question, generate 3-5 distinct search queries.
-        # embed all the questions
-        embedded_question = self.embeddings_model._post_request(question)
-        # store the 5 from all 3-5 questions (15 - 25 results)
-        results = retrieve_from_turso(embedded_question, k=5) # k is limit to return
+        print("Enhancing Question")
+        with dspy.context(lm=self.retrieval_lm):
+            expanded_queries = self.query_expander(question=question).queries
+        print("Enhanced Queries:")
+        for q in expanded_queries:
+            print(" ", q)
+        all_embeddings = self.embeddings_model.embed_documents([question] + expanded_queries)
+        # print(all_embeddings)
+        all_results = []
+        for embedded_question in all_embeddings:
+            results = retrieve_from_turso(embedded_question, k=5)
+            all_results.extend(results)
+        seen = set()
+        unique_results = []
+        for row in all_results:
+            key = (row[0], row[4])
+            if key not in seen:
+                seen.add(key)
+                unique_results.append(row)
+        # Format context as before
         context_parts = []
-        for i, row in enumerate(results):
-            source = row[0]  # file_path
-            synopsis = row[1]  # synopsis
-            tags = row[2]  # tags
-            entities = row[3]  # entities
-            content = row[4]  # chunk_data
+        for i, row in enumerate(unique_results):
+            source = row[0]
+            synopsis = row[1]
+            tags = row[2]
+            entities = row[3]
+            content = row[4]
+            closeness = row[5]
             context_parts.append(f"""
             --- Chunk {i + 1} from {source} ---
             synopsis: {synopsis},
             tags: {tags},
-            entities: {entities}
+            entities: {entities},
+            closeness: {closeness},
             {content}
             """)
+        # print('Closest embedding hits')
+        # for part in context_parts:
+        #     print(part)
         context = "\n\n".join(context_parts)
         prediction = self.generate_answer(context=context, question=question)
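The rewritten `forward` fans one question out into the original plus 3-5 expanded queries, retrieves `k=5` rows per embedding (20-30 rows before de-duplication), then keeps the first occurrence of each `(file_path, chunk_data)` pair so the best-ranked hit per chunk survives. A compact sketch of that flow, where `expand`, `embed`, and `retrieve` are stand-ins for the DSPy expander, `embed_documents`, and `retrieve_from_turso`:

```python
def fan_out_retrieve(question, expand, embed, retrieve, k=5):
    queries = [question] + expand(question)     # 1 original + 3-5 expansions
    embeddings = embed(queries)                 # one vector per query
    all_results = []
    for vec in embeddings:
        all_results.extend(retrieve(vec, k=k))  # up to len(queries) * k rows
    # Order-preserving de-dup: rows arrive best-first per query, and
    # (file_path, chunk_data) identifies a chunk.
    seen, unique = set(), []
    for row in all_results:
        key = (row[0], row[4])
        if key not in seen:
            seen.add(key)
            unique.append(row)
    return unique
```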
+11 -9
View File
@@ -16,6 +16,7 @@ from experts.ingestion_agent import IngestionAgent
 CFG = load_config()
 DATA_DIR = CFG["ingestion"]["data_dir"]
 DATABASE_PATH = CFG["ingestion"]["db_path"]
+DATABASE_NAME = CFG["ingestion"]["db_name"]
 MODEL_BASE = CFG["models"]["enrich"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
@@ -139,13 +140,10 @@ def embed_chunks(chunks: List[Any], batch_size: int = EMBEDDING_BATCH_SIZE) -> L
     # Process chunks in batches
     for i in tqdm(range(0, total_chunks, batch_size), desc="Embedding batches"):
         batch = chunks[i : i + batch_size]
-        print(f"🚀 Processing batch {(i // batch_size) + 1} (Size: {len(batch)})...")
         batch_content = [chunk.page_content for chunk in batch]
         try:
-            # Use model's batched embedding method
-            # batch_embeddings = embeddings_model.embed_query(batch_content)
            batch_embeddings = embeddings_model.embed_documents(batch_content)
            # Process each chunk in the batch
            for j, (chunk, embedding) in enumerate(zip(batch, batch_embeddings)):
                # Extract metadata
@@ -178,8 +176,8 @@ def embed_chunks(chunks: List[Any], batch_size: int = EMBEDDING_BATCH_SIZE) -> L
            print(f"⚠️ Batch processing failed at index {i}: {e}")
            # Fallback: process individually (if needed)
            for j, chunk in enumerate(batch):
-               try:
                content = chunk.page_content
+               try:
                    embedding = embeddings_model.embed_query(content)
                    file_path_orig = chunk.metadata.get("full_path", "unknown")
@@ -228,7 +226,7 @@ def save_to_db(chunk_dicts):
    Each dict maps to a row in the 'notes' table.
    """
    print("connecting to db")
-   con = turso.connect(DATABASE_PATH)
+   con = turso.connect(DATABASE_PATH + DATABASE_NAME)
    print("opening cursor")
    cur = con.cursor()
@@ -252,7 +250,10 @@ def save_to_db(chunk_dicts):
                entry["chunk_data"],
                entry["synopsis"],
                ",".join(entry["tags"]),  # Store as comma-separated string
-               ",".join(entry["entities"]),  # Store as comma-separated string
+               ",".join(
+                   str(e) if isinstance(e, str) else e.get("name", str(e))
+                   for e in entry["entities"]
+               ),  # Store as comma-separated string
                embedding_str,
                entry["timestamp"],
            )
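The defensive join above guards against the malformed enrichment output flagged in the README ("LLM response sometimes outside the expected format"): entities occasionally come back as JSON objects instead of bare strings. An illustrative example (the sample data is made up):

```python
# The enrichment LLM was asked for a list of strings, but sometimes emits objects.
entities = ["Strahd", {"name": "Barovia", "type": "location"}]

flat = ",".join(
    str(e) if isinstance(e, str) else e.get("name", str(e))
    for e in entities
)
print(flat)  # -> "Strahd,Barovia"
```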
@@ -267,7 +268,8 @@
 def create_db():
-    con = turso.connect(DATABASE_PATH)
+    Path(DATABASE_PATH).mkdir(exist_ok=True)
+    con = turso.connect(DATABASE_PATH + DATABASE_NAME)
     cur = con.cursor()
     cur.execute("""
@@ -334,7 +336,7 @@ def delete_from_db(embedded_chunks):
    print(f"Deleting existing rows for {len(file_paths)} file(s)")
-   con = turso.connect(DATABASE_PATH)
+   con = turso.connect(DATABASE_PATH + DATABASE_NAME)
    cur = con.cursor()
    # Use a single DELETE statement with IN clause for efficiency
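A sketch of the single-statement delete that comment describes, in the same string-interpolated SQL style used elsewhere in this diff; `file_paths` is a hypothetical list of re-ingested files, and `con.commit()` is an assumption since the hunk cuts off before any transaction handling:

```python
import turso

def delete_rows(db_file: str, file_paths: list[str]) -> None:
    # One quoted path per entry; real code should bind parameters instead
    # of interpolating, but this mirrors the repo's style.
    placeholders = ", ".join(f"'{p}'" for p in file_paths)
    con = turso.connect(db_file)
    cur = con.cursor()
    # One round trip instead of one DELETE per file.
    cur.execute(f"DELETE FROM notes WHERE file_path IN ({placeholders});")
    con.commit()  # assumed; the diff does not show commit handling
```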