feat: ✨ AI Read File Tool, configurable system prompts, and loading multiple LLMs
@@ -1,4 +1,5 @@
 data/*
+*.log

 # Python-generated files
 __pycache__/
@@ -1,8 +1,10 @@
-Read File Tool for Retrieve Agent
+---Read File Tool for Retrieve Agent---

-Easy Config of system prompts
+---Easy Config of system prompts---
 examples into prompts & better prompts

 LMS CLI script to load multiple models and to make each model accept multiple inferences

 context engineering, - only include vector hits that are x distance?

 AI in the middle - make the ai generate the string for vector search
@@ -19,7 +21,3 @@ QA specific embedding models?

 Evaluation metrics, how good is it doing?
 rate my response!?
-examples into prompts & better prompts

 common model attributes - temp & top-k
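The "only include vector hits that are x distance" idea maps directly onto the `distance` column that `retrieve_from_turso` (later in this diff) already selects. A minimal sketch of such a post-filter; the cutoff value is a placeholder, not something this commit defines:

```python
MAX_DISTANCE = 0.5  # placeholder cutoff; would need tuning against real queries

def filter_hits(rows: list[tuple], max_distance: float = MAX_DISTANCE) -> list[tuple]:
    """Drop vector hits whose cosine distance exceeds the cutoff.

    Each row is (file_path, synopsis, tags, entities, chunk_data, distance),
    matching the columns selected in retrieve_from_turso.
    """
    return [row for row in rows if row[-1] <= max_distance]
```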
Executable
+10
@@ -0,0 +1,10 @@
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-0" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-1" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-2" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-3" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-4" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-5" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-6" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-7" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-8" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-9" --ttl 1800
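The ten `lms load` lines differ only in the identifier suffix, so the script can be generated from the same numbers the config uses. A sketch in Python (shelling out to the `lms` CLI with the exact flags above; the constant names are mine):

```python
import subprocess

MODEL = "qwen/qwen3.5-4b"
ACTIVE_LLMS = 10   # mirrors ingestion.active_llms in the config
PARALLEL = 4       # mirrors ingestion.parallel_requests_per_llm

for i in range(ACTIVE_LLMS):
    # --ttl 1800 unloads an instance after 30 idle minutes.
    subprocess.run(
        ["lms", "load", MODEL,
         "--parallel", str(PARALLEL),
         "--identifier", f"qwen-{i}",
         "--ttl", "1800"],
        check=True,
    )
```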
+19
-6
@@ -5,7 +5,7 @@ api:

 # --- Model Settings ---
 models:
-  enrich: "lm_studio/qwen/qwen3-8b"
+  enrich: "lm_studio/qwen-"
   embedding: "text-embedding-qwen3-embedding-8b"
   retrieval: "lm_studio/qwen/qwen3-30b-a3b-2507"

@@ -13,13 +13,26 @@ models:
 ingestion:
   data_dir: "/home/devin/DnD"
   db_path: "./data/dmv.db"
-  max_workers: 8
+  active_llms: 10
+  parallel_requests_per_llm: 4
   chunk_size: 800
   chunk_overlap: 100
   embedding_batch_size: 32
   time_file_location: "./data/time_file.txt"

 # --- Retrieval Settings ---
 retrieval:
   top_k: 4
   context_limit: 10000  # Max characters from full file context
+# ---- Agent Settings ----
+ingestion_agent:
+  ingestion_signature: |
+    You are an expert Dungeon Master's assistant.
+    Analyze the provided notes and extract a concise synopsis and relevant metadata.
+    synopsis = A one-sentence summary of the document.
+    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
+    entities = A list of key names of people, places, or factions.
+    "note -> synopsis:str, tags: list[str], entities: list[str]"
+
+retrieval_agent:
+  retrieval_signature: |
+    You are an expert Dungeon Master's assistant.
+    Given the context and the question, answer the question.
+    Do not make things up; base all of your answers on the context.
+    Always cite your sources.
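`load_config` is imported throughout this commit but its module is not part of the diff. A plausible minimal reconstruction, assuming the settings above live in a YAML file next to the code (the file name and the PyYAML dependency are assumptions):

```python
# config_loader.py -- hypothetical reconstruction, not from this commit
from pathlib import Path

import yaml  # PyYAML, assumed


def load_config(path: str = "config.yaml") -> dict:
    """Parse the YAML settings file into a plain dict."""
    with Path(path).open() as fh:
        return yaml.safe_load(fh)
```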
+7
-3
@@ -1,12 +1,16 @@
 import requests
 from langchain_core.embeddings import Embeddings
+from config_loader import load_config
+
+CFG = load_config()
+API_BASE = CFG["api"]["base_url"]
+API_VERSION = CFG["api"]["api_version"]

 class LocalLMEmbeddings(Embeddings):
     def __init__(
-        self, model: str, base_url: str = "http://192.168.0.49:1234", batch_size: int = 32
+        self, model: str, base_url: str = API_BASE, batch_size: int = 32
     ):
-        self.url = f"{base_url}/v1/embeddings"
+        self.url = f"{base_url}/{API_VERSION}/embeddings"
         self.model = model
         self.batch_size = batch_size

@@ -27,7 +31,7 @@ class LocalLMEmbeddings(Embeddings):
         return [[] for _ in input_texts]

     def embed_documents(self, texts: list[str]) -> list[list[float]]:
-        """Splits 500+ chunks into batches of 32 and processes them."""
+        """Splits chunks into batches of 32 and processes them."""
         all_embeddings = []

         for i in range(0, len(texts), self.batch_size):
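The request itself sits outside the hunk. For reference, a batch call against an OpenAI-compatible `/embeddings` endpoint looks roughly like this; the helper name and error handling are assumptions, not the file's actual code:

```python
import requests

def embed_batch(url: str, model: str, input_texts: list[str]) -> list[list[float]]:
    """POST one batch to an OpenAI-compatible embeddings endpoint."""
    resp = requests.post(url, json={"model": model, "input": input_texts}, timeout=120)
    resp.raise_for_status()
    # The server returns one embedding object per input, in request order.
    return [item["embedding"] for item in resp.json()["data"]]
```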
@@ -1,21 +1,16 @@
 import dspy
 from typing import List
+from config_loader import load_config
+
+CFG = load_config()
+INGESTION_CONFIG = CFG["ingestion_agent"]

 class IngestionSignature(dspy.Signature):
-    """You are an expert Dungeon Master's assistant.
-    Analyze the provided notes and extract a concise synopsis and relevant metadata.
-    synopsis = A one-sentence summary of the document.
-    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
-    entities = Key names of people, places, or factions.
-    "note -> synopsis:str, tags: list[str], entities: list[str]"
-    /no_think
-    """
+    # Store the configured prompt as the class docstring; a bare f-string here
+    # would not become __doc__, so the instructions would silently be dropped.
+    __doc__ = INGESTION_CONFIG["ingestion_signature"]

     note: str = dspy.InputField(desc="The DM notes or session recap content.")
     answer: dict[str, str | List] = dspy.OutputField(desc="The metadata dictionary with the keys: synopsis, tags, entities.")


 class IngestionAgent(dspy.Module):
     def __init__(self):
         self.ingest = dspy.Predict(IngestionSignature)
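A single enrichment call against one of the loaded instances would look roughly like this (a sketch: the module path is a guess and the note text is invented sample input; the model identifier and URL echo the rest of the commit):

```python
import dspy
from experts.ingestion_agent import IngestionAgent  # module path assumed

lm = dspy.LM(
    model="lm_studio/qwen-0",
    api_base="http://framework.tawny-bellatrix.ts.net:1234/v1/",
)
with dspy.context(lm=lm):
    result = IngestionAgent().ingest(note="The party met Lady Vex at the Gilded Stag.")
print(result.answer)  # {"synopsis": ..., "tags": [...], "entities": [...]}
```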
@@ -1,7 +1,7 @@
 # from pathlib import Path
+import os
 import turso
 import dspy
 # from langchain_community.vectorstores import FAISS


 from config_loader import load_config
 from embedding import LocalLMEmbeddings
@@ -11,11 +11,10 @@ CFG = load_config()
 DATABASE_PATH = CFG["ingestion"]["db_path"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
+RETRIEVAL_CONFIG = CFG["retrieval_agent"]


 # Inside your retrieval logic:
 def retrieve_from_turso(embedded_question, k=5):
     # Example query: search for relevant notes using full-text search or embedding similarity
     # Note: Turso supports SQLite, so you can use FTS5 or a vector extension if available
     query = f"""
         SELECT file_path, synopsis, tags, entities, chunk_data,
             vector_distance_cos(embedding, vector32('{embedded_question[0]}')) AS distance
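The hunk ends mid-query, so the ordering and limit clauses are not visible. A sketch of how a libSQL cosine-distance query is typically completed; the table name and cursor handling are assumptions:

```python
# Hypothetical completion of the query above; 'notes' is an assumed table name.
def retrieve_sketch(conn, embedded_question, k=5):
    query = f"""
        SELECT file_path, synopsis, tags, entities, chunk_data,
            vector_distance_cos(embedding, vector32('{embedded_question[0]}')) AS distance
        FROM notes
        ORDER BY distance ASC
        LIMIT {k};
    """
    return conn.execute(query).fetchall()
```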
@@ -31,8 +30,7 @@ def retrieve_from_turso(embedded_question, k=5):

 # --- DSPy Signature ---
 class DnDContextQA(dspy.Signature):
-    """Answer DnD campaign questions using provided details.
-    """
+    __doc__ = RETRIEVAL_CONFIG["retrieval_signature"]

     context = dspy.InputField(
         desc="Relevant chunks and metadata from the campaign notes."
@@ -49,7 +47,11 @@ class DnDRAG(dspy.Module):
             base_url=API_BASE,
             batch_size=1,  # we only send 1 question at a time.
         )
-        self.generate_answer = dspy.ChainOfThought(DnDContextQA)
+        # Tools exposed to the ReAct loop
+        self.tools = [
+            self.load_file
+        ]
+        self.generate_answer = dspy.ReAct(signature=DnDContextQA, tools=self.tools)

     def forward(self, question):
         # Use Turso to retrieve relevant notes
@@ -73,12 +75,23 @@ tags: {tags},
 entities: {entities}
 {content}
 """)

-        print('Closest embedding hits')
-        for part in context_parts:
-            print(part)
+        # print('Closest embedding hits')
+        # for part in context_parts:
+        #     print(part)

         context = "\n\n".join(context_parts)

         prediction = self.generate_answer(context=context, question=question)
         return dspy.Prediction(answer=prediction.answer, context=context)

+    def load_file(self, file_path) -> str | None:
+        """Load and return specified file."""
+        if os.path.exists(file_path):
+            try:
+                with open(file_path) as file:
+                    return file.read()
+            except Exception:
+                return None
+        else:
+            return None
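`dspy.ReAct` derives a tool's name and description from the callable's name and docstring, so the one-liner on `load_file` is what the model sees when deciding whether to open a file. Usage is unchanged from the ChainOfThought version; a minimal sketch, assuming an LM has already been configured:

```python
import dspy
from experts.retrieval_agent import DnDRAG

# Assumes dspy.configure(lm=dspy.LM(...)) has run, as in main().
rag = DnDRAG()
pred = rag(question="Who rules the city of Emberfall?")  # invented sample question
print(pred.answer)
```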
+6
-4
@@ -19,7 +19,9 @@ MODEL_BASE = CFG["models"]["enrich"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]
-MAX_WORKERS = CFG["ingestion"]["max_workers"]
+# MAX_WORKERS = CFG["ingestion"]["max_workers"]
+ACTIVE_LLMS = CFG["ingestion"]["active_llms"]
+PARALLEL_REQUESTS_PER_LLM = CFG["ingestion"]["parallel_requests_per_llm"]
 CHUNK_SIZE = CFG["ingestion"]["chunk_size"]
 CHUNK_OVERLAP = CFG["ingestion"]["chunk_overlap"]
 EMBEDDING_BATCH_SIZE = CFG["ingestion"]["embedding_batch_size"]
@@ -75,10 +77,10 @@ def chunk_documents(docs):
 def enrich_chunks(chunks: list) -> list:
     def process_single_chunk(indexed_chunk):
         idx, chunk = indexed_chunk
-        lm_index = idx % 8
+        lm_index = idx % ACTIVE_LLMS

         try:
-            with dspy.context(lm=dspy.LM(model=MODEL_BASE, api_base=API_BASE + API_VERSION)):
+            with dspy.context(lm=dspy.LM(model=f"{MODEL_BASE}{lm_index}", api_base=API_BASE + API_VERSION), chat_template_kwargs={"enable_thinking": False}):
                 response = IngestionAgent().ingest(note=chunk.page_content)

             # This is now an object, not a string!
@@ -92,7 +94,7 @@ def enrich_chunks(chunks: list) -> list:
             return (idx, chunk)

     enriched_results = []
-    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+    with ThreadPoolExecutor(max_workers=PARALLEL_REQUESTS_PER_LLM * ACTIVE_LLMS) as executor:
         # Wrap chunks in enumerate to keep track of order
         futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
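The thread count is now derived instead of hard-coded: 10 active instances × 4 parallel requests each = 40 workers, and `idx % ACTIVE_LLMS` spreads chunks round-robin across the `qwen-0` … `qwen-9` identifiers loaded by the script above. A toy illustration:

```python
ACTIVE_LLMS = 10
PARALLEL_REQUESTS_PER_LLM = 4

max_workers = PARALLEL_REQUESTS_PER_LLM * ACTIVE_LLMS  # 40 threads
for idx in range(12):
    print(idx, "->", f"lm_studio/qwen-{idx % ACTIVE_LLMS}")
# 0 -> lm_studio/qwen-0 ... 9 -> lm_studio/qwen-9, 10 -> lm_studio/qwen-0, ...
```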
+82
-6
@@ -1,16 +1,93 @@
 import sys
 import dspy
 # import turso
+import logging
+from dspy.utils.callback import BaseCallback

+from logging.handlers import RotatingFileHandler

 from config_loader import load_config
-from experts.dnd_agent import DnDRAG
+from experts.retrieval_agent import DnDRAG

 CFG = load_config()
 RETRIEVE_MODEL = CFG["models"]["retrieval"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]

+class CallbackHandler(BaseCallback):
+    """Custom callback class for logging agent interactions."""
+
+    def __init__(self, logger):
+        """Initialize the callback with a logger instance."""
+        super().__init__()
+        self.logger = logger
+
+    def on_module_end(self, call_id, outputs, exception):
+        """Handle module end events for logging."""
+        step = "Reasoning" if self._is_reasoning_output(outputs) else "Acting"
+        self.logger.debug(f"=== {step} Step ===")
+        for k, v in outputs.items():
+            self.logger.debug(f" {k}: {v}")
+
+    def on_lm_start(self, call_id, instance, inputs):
+        """Handle language model start events for logging."""
+        self.logger.debug(f"LM is called with inputs: {inputs}")
+
+    def on_tool_start(self, call_id, instance, inputs):
+        """Handle tool start events for logging."""
+        self.logger.debug(f"Tool {instance} called with inputs: {inputs}")
+
+    def on_tool_end(self, call_id, outputs, exception):
+        """Handle tool end events for logging."""
+        self.logger.debug(f"Tool finished with outputs: {outputs}")
+
+    def on_lm_end(self, call_id, outputs, exception):
+        """Handle language model end events for logging."""
+        self.logger.debug(f"LM is finished with outputs: {outputs}")
+
+    def _is_reasoning_output(self, outputs):
+        return any(k.startswith("Thought") for k in outputs)
+
+
+def setup_logging():
+    """Set up logging configuration for Merlin."""
+    # Create a custom logger
+    logger = logging.getLogger(__name__)
+
+    # Set the minimum level for the logger
+    logger.setLevel(logging.DEBUG)
+
+    # Create a console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    # Create a file handler with rotation every 5MB
+    file_handler = RotatingFileHandler(
+        "dmv.log", maxBytes=5 * 1024 * 1024, backupCount=3
+    )
+    file_handler.setLevel(logging.DEBUG)
+
+    # Create a formatter
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+
+    # Set the formatter for the handlers
+    console_handler.setFormatter(formatter)
+    file_handler.setFormatter(formatter)
+
+    # Add the handlers to the logger
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+
+    return logger


 def main():
+    logger = setup_logging()
+    logger.debug("main application started")
+
+    # Add verbose errors and the logging callback
+    dspy.configure(verbose_errors=True)
+    dspy.configure(callbacks=[CallbackHandler(logger)])
     # 1. Setup the LLM
     print("🚀 Initializing Qwen-8B via LM Studio...")
     lm = dspy.LM(RETRIEVE_MODEL, api_base=API_BASE + API_VERSION)
@@ -32,7 +109,7 @@ def main():
             query = input("📝 Query: ").strip()

             # Exit conditions
-            if query.lower() in ["exit", "quit", "q"]:
+            if query.lower() in ["exit", "quit", "q", "bye"]:
                 print("Farewell, traveler. Good luck on your quest!")
                 break

@@ -47,11 +124,10 @@ def main():
             print(response.answer)

         except KeyboardInterrupt:
-            print("\n\nExiting... See you next session!")
+            print("\n\nRude?!.... Exiting...")
             sys.exit(0)
         except Exception as e:
             print(f"\n⚠️ An error occurred: {e}")


 if __name__ == "__main__":
-    main()
+    main()
@@ -0,0 +1,25 @@
+import dspy
+
+base_url = "http://framework.tawny-bellatrix.ts.net:1234"
+model_name = "lm_studio/qwen-0"
+
+
+lm = dspy.LM(
+    model=model_name,
+    api_base=f"{base_url}/v1/"
+)
+dspy.configure(lm=lm)
+
+
+# question = "How can I use the dspy framework to add 'chat_template_kwargs={\"enable_thinking\": False}' to my API call to LM Studio? I know it uses litellm under the hood"
+# question = "Hi there, do you have a name? If not, I want you to name yourself."
+question = "How long would it take light to travel from the sun to the earth? /no_think"
+
+# Call with request_kwargs to inject the template kwargs
+response = lm(
+    messages=[{"role": "user", "content": question}]
+    # extra_body={"enable_thinking": False}
+    # enable_thinking=False
+)
+
+print(response)
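One untested avenue for the commented-out experiment above: `dspy.LM` forwards extra keyword arguments to LiteLLM, and LiteLLM passes `extra_body` through to OpenAI-compatible servers, so the template kwargs could plausibly ride along at construction time. A sketch; whether LM Studio honors `chat_template_kwargs` sent this way is an assumption:

```python
import dspy

lm = dspy.LM(
    model="lm_studio/qwen-0",
    api_base="http://framework.tawny-bellatrix.ts.net:1234/v1/",
    # Unverified: relies on LiteLLM forwarding extra_body to the server.
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
dspy.configure(lm=lm)
```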