From 1e20a5452f150bc9e9c0834ed1f68d07894c6616 Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Sun, 8 Mar 2026 17:28:29 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20=F0=9F=90=9B=20more=20stable=20ingestion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ROADMAP.md | 10 +++++++--- config.yaml | 44 +++++++++++++++++++++++++++++--------------- src/ingest.py | 7 +++++-- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index 06c3de4..15c4183 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -11,11 +11,15 @@ ## Planned Next -* AI in the middle - make the llm generate multiple queries for a wider search +* database retrieve for tag or entity ## Planned Later * entity chunking & re-ranking * Logging in Ingestion -* database retrieve for tag or entity -* +* More robust ingestion - llm response sometimes out of expected + + +## Done + +* AI in the middle - make the llm generate multiple queries for a wider search diff --git a/config.yaml b/config.yaml index 6864ca5..80b7e44 100644 --- a/config.yaml +++ b/config.yaml @@ -16,7 +16,7 @@ ingestion: db_path: "./data/" db_name: "dmv.db" active_llms: 2 - parallel_requests_per_llm: 2 + parallel_requests_per_llm: 4 chunk_size: 800 chunk_overlap: 100 embedding_batch_size: 32 @@ -25,23 +25,37 @@ ingestion: # ---- Agent Settings ---- ingestion_agent: ingestion_signature: | - You are an expert Dungeon Master's assistant. - Analyze the provided notes and extract a concise synopsis and relevant metadata. - synopsis = A one-sentence summary of the document. - tags = Relevant tags (NPCs, Locations, Items, Plot Points). - entities = a list of Key names of people, places, or factions. - "note -> synopsis:str, tags: list[str], entities: list[str]" + You are an expert Dungeon Master's assistant specialized in campaign note enrichment. + Your task is to analyze DnD session notes and extract structured metadata. + + Follow these guidelines: + - SYNOPSIS: One concise sentence capturing the key event or development (use active voice) + - TAGS: Extract 3-7 relevant tags from: Campaign arcs, NPC names, Locations, Items, Spells, Factions, Plot hooks, Themes + - ENTITIES: List all proper nouns (NPCs, locations, organizations) - be specific and consistent with naming + The TAGS and ENTITIES must be a list of strings, not json objects + Format output as JSON with keys: synopsis, tags, entities retrieval_agent: retrieval_signature: | - You are an expert Dungeon Master's assistant. - Given the context and the question, answer the question. - Do not make things up, base all of your answers on the context. - Always site the file location of your source of information. + You are an expert Dungeon Master's assistant helping to run a campaign. + When answering questions about your DnD world: + + 1. Strictly use ONLY the provided context from campaign notes + 2. If information is incomplete, infer plausibly based on established lore (flag inferences) + 3. Always cite sources: "Per [filename], [quote/summary]" + 4. Maintain character voice and narrative style when appropriate + 5. For rules questions, distinguish between rules-as-written and DM interpretation + + Provide comprehensive answers that help you run the game, including relevant details about NPCs, locations, or plot points. expansion_agent: expansion_signature: | - You are a query expansion expert, specialised in Dungeons and Dragons. - Given a user's question, generate 3-5 similar but enhanced search queries that would help find more relevant information. - Each expanded query should be distinct and add different perspective to the original question. - Return only the queries as a JSON list with key "queries".""" + You are a query expansion expert specialized in Dungeons & Dragons campaign management. + + Given a user question about their DnD world, generate 3-5 enhanced search queries that: + - Cover different aspects (characters, locations, lore, rules) + - Include synonyms and related terms (e.g., "dragon" → "wyrm", "scales" → "armor") + - Address potential follow-up questions the DM might have + - Vary specificity (broad to narrow) + + Return ONLY a JSON array with key "queries". Keep queries concise (5-10 words each). diff --git a/src/ingest.py b/src/ingest.py index b12b2b3..f33e432 100644 --- a/src/ingest.py +++ b/src/ingest.py @@ -176,8 +176,8 @@ def embed_chunks(chunks: List[Any], batch_size: int = EMBEDDING_BATCH_SIZE) -> L print(f"⚠️ Batch processing failed at index {i}: {e}") # Fallback: process individually (if needed) for j, chunk in enumerate(batch): + content = chunk.page_content try: - content = chunk.page_content embedding = embeddings_model.embed_query(content) file_path_orig = chunk.metadata.get("full_path", "unknown") @@ -250,7 +250,10 @@ def save_to_db(chunk_dicts): entry["chunk_data"], entry["synopsis"], ",".join(entry["tags"]), # Store as comma-separated string - ",".join(entry["entities"]), # Store as comma-separated string + ",".join( + str(e) if isinstance(e, str) else e.get("name", str(e)) + for e in entry["entities"] + ), # Store as comma-separated string embedding_str, entry["timestamp"], )