From 1e20a5452f150bc9e9c0834ed1f68d07894c6616 Mon Sep 17 00:00:00 2001
From: Jake Pullen <hello@jake-is.me>
Date: Sun, 8 Mar 2026 17:28:29 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20=F0=9F=90=9B=20more=20stable=20ingestion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ROADMAP.md    | 10 +++++++---
 config.yaml   | 44 +++++++++++++++++++++++++++++---------------
 src/ingest.py |  7 +++++--
 3 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/ROADMAP.md b/ROADMAP.md
index 06c3de4..15c4183 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -11,11 +11,15 @@
 
 ## Planned Next
 
-* AI in the middle - make the llm generate multiple queries for a wider search
+* database retrieve for tag or entity
 
 ## Planned Later
 
 * entity chunking & re-ranking
 * Logging in Ingestion
-* database retrieve for tag or entity
-*
+* More robust ingestion - LLM responses sometimes deviate from the expected format
+
+
+## Done
+
+* AI in the middle - make the llm generate multiple queries for a wider search
diff --git a/config.yaml b/config.yaml
index 6864ca5..80b7e44 100644
--- a/config.yaml
+++ b/config.yaml
@@ -16,7 +16,7 @@ ingestion:
   db_path: "./data/"
   db_name: "dmv.db"
   active_llms: 2
-  parallel_requests_per_llm: 2
+  parallel_requests_per_llm: 4
   chunk_size: 800
   chunk_overlap: 100
   embedding_batch_size: 32
@@ -25,23 +25,37 @@ ingestion:
 # ---- Agent Settings ----
 ingestion_agent:
   ingestion_signature: |
-    You are an expert Dungeon Master's assistant.
-    Analyze the provided notes and extract a concise synopsis and relevant metadata.
-    synopsis = A one-sentence summary of the document.
-    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
-    entities = a list of Key names of people, places, or factions.
-    "note -> synopsis:str, tags: list[str], entities: list[str]"
+    You are an expert Dungeon Master's assistant specialized in campaign note enrichment.
+    Your task is to analyze DnD session notes and extract structured metadata.
+
+    Follow these guidelines:
+    - SYNOPSIS: One concise sentence capturing the key event or development (use active voice)
+    - TAGS: Extract 3-7 relevant tags from: Campaign arcs, NPC names, Locations, Items, Spells, Factions, Plot hooks, Themes
+    - ENTITIES: List all proper nouns (NPCs, locations, organizations) - be specific and consistent with naming
+    TAGS and ENTITIES must each be a flat list of strings, not JSON objects
+    Format output as JSON with keys: synopsis, tags, entities
 
 retrieval_agent:
   retrieval_signature: |
-    You are an expert Dungeon Master's assistant.
-    Given the context and the question, answer the question.
-    Do not make things up, base all of your answers on the context.
-    Always site the file location of your source of information.
+    You are an expert Dungeon Master's assistant helping to run a campaign.
+    When answering questions about your DnD world:
+
+    1. Strictly use ONLY the provided context from campaign notes
+    2. If information is incomplete, infer plausibly based on established lore (flag inferences)
+    3. Always cite sources: "Per [filename], [quote/summary]"
+    4. Maintain character voice and narrative style when appropriate
+    5. For rules questions, distinguish between rules-as-written and DM interpretation
+
+    Provide comprehensive answers that help the Dungeon Master run the game, including relevant details about NPCs, locations, or plot points.
 
 expansion_agent:
   expansion_signature: |
-    You are a query expansion expert, specialised in Dungeons and Dragons.
-    Given a user's question, generate 3-5 similar but enhanced search queries that would help find more relevant information.
-    Each expanded query should be distinct and add different perspective to the original question.
-    Return only the queries as a JSON list with key "queries"."""
+    You are a query expansion expert specialized in Dungeons & Dragons campaign management.
+
+    Given a user question about their DnD world, generate 3-5 enhanced search queries that:
+    - Cover different aspects (characters, locations, lore, rules)
+    - Include synonyms and related terms (e.g., "dragon" → "wyrm", "scales" → "armor")
+    - Address potential follow-up questions the DM might have
+    - Vary specificity (broad to narrow)
+
+    Return ONLY a JSON object with a single key "queries" whose value is an array of query strings. Keep queries concise (5-10 words each).
diff --git a/src/ingest.py b/src/ingest.py
index b12b2b3..f33e432 100644
--- a/src/ingest.py
+++ b/src/ingest.py
@@ -176,8 +176,8 @@ def embed_chunks(chunks: List[Any], batch_size: int = EMBEDDING_BATCH_SIZE) -> L
             print(f"⚠️ Batch processing failed at index {i}: {e}")
             # Fallback: process individually (if needed)
             for j, chunk in enumerate(batch):
+                content = chunk.page_content
                 try:
-                    content = chunk.page_content
                     embedding = embeddings_model.embed_query(content)
 
                     file_path_orig = chunk.metadata.get("full_path", "unknown")
@@ -250,7 +250,10 @@ def save_to_db(chunk_dicts):
                 entry["chunk_data"],
                 entry["synopsis"],
                 ",".join(entry["tags"]),  # Store as comma-separated string
-                ",".join(entry["entities"]),  # Store as comma-separated string
+                ",".join(
+                    str(e) if isinstance(e, str) else e.get("name", str(e))
+                    for e in entry["entities"]
+                ),  # Store as comma-separated string
                 embedding_str,
                 entry["timestamp"],
             )