added toon-python submodule

2026-05-10 14:06:18 +01:00
5 changed files with 24 additions and 41 deletions
@@ -0,0 +1,3 @@
 [submodule "toon-python"]
 	path = toon-python
 	url = git@github.com:toon-format/toon-python.git
@@ -11,15 +11,11 @@
 ## Planned Next
-* database retrieve for tag or entity
+* AI in the middle - make the llm generate multiple queries for a wider search
 ## Planned Later
 * entity chunking & re-ranking
 * Logging in Ingestion
-* More robust ingestion - llm response sometimes out of expected
+* database retrieve for tag or entity
-
+*
 ## Done
 * AI in the middle - make the llm generate multiple queries for a wider search
@@ -16,7 +16,7 @@ ingestion:
  db_path: "./data/"
  db_name: "dmv.db"
  active_llms: 2
-  parallel_requests_per_llm: 4
+  parallel_requests_per_llm: 2
  chunk_size: 800
  chunk_overlap: 100
  embedding_batch_size: 32
@@ -25,37 +25,23 @@ ingestion:
 # ---- Agent Settings ----
 ingestion_agent:
  ingestion_signature: |
-    You are an expert Dungeon Master's assistant specialized in campaign note enrichment.
+    You are an expert Dungeon Master's assistant.
-    Your task is to analyze DnD session notes and extract structured metadata.
+    Analyze the provided notes and extract a concise synopsis and relevant metadata.
-
+    synopsis = A one-sentence summary of the document.
-    Follow these guidelines:
+    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
-    - SYNOPSIS: One concise sentence capturing the key event or development (use active voice)
+    entities = A list of key names of people, places, or factions.
-    - TAGS: Extract 3-7 relevant tags from: Campaign arcs, NPC names, Locations, Items, Spells, Factions, Plot hooks, Themes
+    "note -> synopsis:str, tags: list[str], entities: list[str]"
    - ENTITIES: List all proper nouns (NPCs, locations, organizations) - be specific and consistent with naming
    The TAGS and ENTITIES must be a list of strings, not json objects
    Format output as JSON with keys: synopsis, tags, entities
 retrieval_agent:
  retrieval_signature: |
-    You are an expert Dungeon Master's assistant helping to run a campaign.
+    You are an expert Dungeon Master's assistant.
-    When answering questions about your DnD world:
+    Given the context and the question, answer the question.
-
+    Do not make things up, base all of your answers on the context.
-    1. Strictly use ONLY the provided context from campaign notes
+    Always cite the file location of your source of information.
    2. If information is incomplete, infer plausibly based on established lore (flag inferences)
    3. Always cite sources: "Per [filename], [quote/summary]"
    4. Maintain character voice and narrative style when appropriate
    5. For rules questions, distinguish between rules-as-written and DM interpretation
    Provide comprehensive answers that help you run the game, including relevant details about NPCs, locations, or plot points.
 expansion_agent:
  expansion_signature: |
-    You are a query expansion expert specialized in Dungeons & Dragons campaign management.
+    You are a query expansion expert, specialised in Dungeons and Dragons.
-
+    Given a user's question, generate 3-5 similar but enhanced search queries that would help find more relevant information.
-    Given a user question about their DnD world, generate 3-5 enhanced search queries that:
+    Each expanded query should be distinct and add a different perspective to the original question.
-    - Cover different aspects (characters, locations, lore, rules)
+    Return only the queries as a JSON list with key "queries".
    - Include synonyms and related terms (e.g., "dragon" → "wyrm", "scales" → "armor")
    - Address potential follow-up questions the DM might have
    - Vary specificity (broad to narrow)
    Return ONLY a JSON array with key "queries". Keep queries concise (5-10 words each).
@@ -176,8 +176,8 @@ def embed_chunks(chunks: List[Any], batch_size: int = EMBEDDING_BATCH_SIZE) -> L
            print(f"⚠️ Batch processing failed at index {i}: {e}")
            # Fallback: process individually (if needed)
            for j, chunk in enumerate(batch):
                content = chunk.page_content
                try:
                    content = chunk.page_content
                    embedding = embeddings_model.embed_query(content)
                    file_path_orig = chunk.metadata.get("full_path", "unknown")
@@ -250,10 +250,7 @@ def save_to_db(chunk_dicts):
                entry["chunk_data"],
                entry["synopsis"],
                ",".join(entry["tags"]),  # Store as comma-separated string
-                ",".join(
+                ",".join(entry["entities"]),  # Store as comma-separated string
                    str(e) if isinstance(e, str) else e.get("name", str(e))
                    for e in entry["entities"]
                ),  # Store as comma-separated string
                embedding_str,
                entry["timestamp"],
            )