feat: AI Read File Tool, configurable system prompts, and loading lots of LLMs

2026-03-04 15:48:25 +00:00
parent bbaebf1f70
commit 0d0e747682
10 changed files with 184 additions and 47 deletions
+1
View File
@@ -1,4 +1,5 @@
 data/*
+*.log
 # Python-generated files
 __pycache__/
+4 -6
View File
@@ -1,8 +1,10 @@
-Read File Tool for Retrieve Agent
+---Read File Tool for Retrieve Agent---
-Easy Config of system prompts
+---Easy Config of system prompts---
 examples into prompts & better prompts
+LMS CLI script to load multiple models and to make each model accept multiple inferences
 context engineering - only include vector hits within x distance?
 AI in the middle - make the AI generate the string for vector search
@@ -19,7 +21,3 @@ QA specific embedding models?
 Evaluation metrics, how good is it doing?
 rate my response!?
-examples into prompts & better prompts
-common model attributes - temp & top-k
+10
View File
@@ -0,0 +1,10 @@
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-0" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-1" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-2" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-3" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-4" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-5" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-6" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-7" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-8" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-9" --ttl 1800
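The ten commands above differ only in the identifier suffix; a minimal Python sketch that could generate them, assuming the `lms` CLI accepts exactly the flags used in this script (not part of the commit):

import subprocess

# Hypothetical generator for the ten `lms load` commands above.
for i in range(10):
    subprocess.run(
        [
            "lms", "load", "qwen/qwen3.5-4b",
            "--parallel", "4",            # concurrent inferences per instance
            "--identifier", f"qwen-{i}",  # matches the "lm_studio/qwen-" prefix in config.yaml
            "--ttl", "1800",              # seconds before an idle model unloads
        ],
        check=True,
    )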
+19 -6
View File
@@ -5,7 +5,7 @@ api:
 # --- Model Settings ---
 models:
-  enrich: "lm_studio/qwen/qwen3-8b"
+  enrich: "lm_studio/qwen-"
   embedding: "text-embedding-qwen3-embedding-8b"
   retrieval: "lm_studio/qwen/qwen3-30b-a3b-2507"
@@ -13,13 +13,26 @@ models:
 ingestion:
   data_dir: "/home/devin/DnD"
   db_path: "./data/dmv.db"
-  max_workers: 8
+  active_llms: 10
+  parallel_requests_per_llm: 4
   chunk_size: 800
   chunk_overlap: 100
   embedding_batch_size: 32
   time_file_location: "./data/time_file.txt"
-# --- Retrieval Settings ---
-retrieval:
-  top_k: 4
-  context_limit: 10000 # Max characters from full file context
+# ---- Agent Settings ----
+ingestion_agent:
+  ingestion_signature: |
+    You are an expert Dungeon Master's assistant.
+    Analyze the provided notes and extract a concise synopsis and relevant metadata.
+    synopsis = A one-sentence summary of the document.
+    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
+    entities = a list of key names of people, places, or factions.
+    "note -> synopsis:str, tags: list[str], entities: list[str]"
+retrieval_agent:
+  retrieval_signature: |
+    You are an expert Dungeon Master's assistant.
+    Given the context and the question, answer the question.
+    Do not make things up, base all of your answers on the context.
+    Always cite your sources
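Every module in this diff calls `load_config`, but `config_loader` itself is unchanged and not shown. A plausible minimal implementation, assuming PyYAML and a `config.yaml` beside the code (hypothetical, for orientation only):

# config_loader.py -- hypothetical sketch; the real module is not in this diff.
import yaml

def load_config(path: str = "config.yaml") -> dict:
    # Parse the YAML config into a plain dict, e.g. CFG["ingestion"]["active_llms"].
    with open(path) as f:
        return yaml.safe_load(f)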
+7 -3
View File
@@ -1,12 +1,16 @@
 import requests
 from langchain_core.embeddings import Embeddings
+from config_loader import load_config
+
+CFG = load_config()
+API_BASE = CFG["api"]["base_url"]
+API_VERSION = CFG["api"]["api_version"]

 class LocalLMEmbeddings(Embeddings):
     def __init__(
-        self, model: str, base_url: str = "http://192.168.0.49:1234", batch_size: int = 32
+        self, model: str, base_url: str = API_BASE, batch_size: int = 32
     ):
-        self.url = f"{base_url}/v1/embeddings"
+        self.url = f"{base_url}/{API_VERSION}/embeddings"
         self.model = model
         self.batch_size = batch_size
@@ -27,7 +31,7 @@ class LocalLMEmbeddings(Embeddings):
         return [[] for _ in input_texts]

     def embed_documents(self, texts: list[str]) -> list[list[float]]:
-        """Splits 500+ chunks into batches of 32 and processes them."""
+        """Splits chunks into batches of 32 and processes them."""
         all_embeddings = []
         for i in range(0, len(texts), self.batch_size):
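The hunk cuts off inside the batching loop, whose body is unchanged by this commit. For orientation, a standalone sketch of what such a loop typically looks like against an OpenAI-compatible /embeddings endpoint (assumed response shape, not the project's actual code):

import requests

def embed_in_batches(url: str, model: str, texts: list[str], batch_size: int = 32) -> list[list[float]]:
    # Mirrors embed_documents: slice into batches, POST each, keep order.
    all_embeddings: list[list[float]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        resp = requests.post(url, json={"model": model, "input": batch}, timeout=120)
        resp.raise_for_status()
        # OpenAI-compatible servers return one vector per input, in order.
        all_embeddings.extend(item["embedding"] for item in resp.json()["data"])
    return all_embeddings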
+5 -10
View File
@@ -1,21 +1,16 @@
 import dspy
 from typing import List
+from config_loader import load_config
+
+CFG = load_config()
+INGESTION_CONFIG = CFG["ingestion_agent"]

 class IngestionSignature(dspy.Signature):
-    """You are an expert Dungeon Master's assistant.
-    Analyze the provided notes and extract a concise synopsis and relevant metadata.
-    synopsis = A one-sentence summary of the document.
-    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
-    entities = Key names of people, places, or factions.
-    "note -> synopsis:str, tags: list[str], entities: list[str]"
-    /no_think
-    """
+    f"{INGESTION_CONFIG["ingestion_signature"]}"

     note: str = dspy.InputField(desc="The DM notes or session recap content.")
     answer: dict[str,str|List] = dspy.OutputField(desc="the metadata dictionary with the keys; synopsis, tags, entities")

 class IngestionAgent(dspy.Module):
     def __init__(self):
         self.ingest = dspy.Predict(IngestionSignature)
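One caveat with the new signature line: a bare f-string in a class body is an expression, not a docstring, so dspy will not pick it up as the signature's instructions, and nesting double quotes inside a double-quoted f-string only parses on Python 3.12+. A sketch of one way to wire the configured prompt in, assuming dspy's `with_instructions` helper:

import dspy
from typing import List
from config_loader import load_config

CFG = load_config()
INGESTION_CONFIG = CFG["ingestion_agent"]

class IngestionSignature(dspy.Signature):
    note: str = dspy.InputField(desc="The DM notes or session recap content.")
    answer: dict[str, str | List] = dspy.OutputField(
        desc="the metadata dictionary with the keys; synopsis, tags, entities"
    )

# with_instructions returns a copy of the signature carrying the configured
# prompt as its instructions; rebinding the name keeps Predict unchanged.
IngestionSignature = IngestionSignature.with_instructions(
    INGESTION_CONFIG["ingestion_signature"]
)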
View File
@@ -1,7 +1,7 @@
-# from pathlib import Path
+import os
 import turso
 import dspy
-# from langchain_community.vectorstores import FAISS
 from config_loader import load_config
 from embedding import LocalLMEmbeddings
@@ -11,11 +11,10 @@ CFG = load_config()
 DATABASE_PATH = CFG["ingestion"]["db_path"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
+RETRIEVAL_CONFIG = CFG["retrieval_agent"]

-# Inside your retrieval logic:
 def retrieve_from_turso(embedded_question, k=5):
-    # Example query: search for relevant notes using full-text search or embedding similarity
-    # Note: Turso supports SQLite, so you can use FTS5 or a vector extension if available
     query = f"""
         SELECT file_path, synopsis, tags, entities, chunk_data,
                vector_distance_cos(embedding, vector32('{embedded_question[0]}')) AS distance
@@ -31,8 +30,7 @@ def retrieve_from_turso(embedded_question, k=5):
 # --- DSPy Signature ---
 class DnDContextQA(dspy.Signature):
-    """Answer DnD campaign questions using provided details.
-    """
+    f"{RETRIEVAL_CONFIG["retrieval_signature"]}"

     context = dspy.InputField(
         desc="Relevant chunks and metadata from the campaign notes."
@@ -49,7 +47,11 @@ class DnDRAG(dspy.Module):
             base_url=API_BASE,
             batch_size=1,  # we only send 1 question at a time.
         )
-        self.generate_answer = dspy.ChainOfThought(DnDContextQA)
+        # Tools exposed to the ReAct loop
+        self.tools = [
+            self.load_file
+        ]
+        self.generate_answer = dspy.ReAct(signature=DnDContextQA, tools=self.tools)
     def forward(self, question):
         # Use Turso to retrieve relevant notes
@@ -74,11 +76,22 @@ entities: {entities}
 {content}
 """)

-        print('Closest embedding hits')
-        for part in context_parts:
-            print(part)
+        # print('Closest embedding hits')
+        # for part in context_parts:
+        #     print(part)

         context = "\n\n".join(context_parts)
         prediction = self.generate_answer(context=context, question=question)
         return dspy.Prediction(answer=prediction.answer, context=context)

+    def load_file(self, file_path) -> str | None:
+        """Load and return specified file."""
+        if os.path.exists(file_path):
+            try:
+                with open(file_path) as file:
+                    return file.read()
+            except Exception:
+                return None
+        else:
+            return None
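Because `load_file` is exposed to the model as a ReAct tool, the agent can request any readable path on the machine. A hedged sketch of a more defensive variant, confining reads to the configured notes directory (`data_dir` from config.yaml; not part of this commit):

import os
from config_loader import load_config  # same helper the module already uses

CFG = load_config()
ALLOWED_ROOT = os.path.realpath(CFG["ingestion"]["data_dir"])

def load_file(file_path: str) -> str | None:
    # Resolve symlinks and .. segments, then refuse anything outside the notes tree.
    real = os.path.realpath(file_path)
    if real != ALLOWED_ROOT and not real.startswith(ALLOWED_ROOT + os.sep):
        return None
    try:
        with open(real) as file:
            return file.read()
    except OSError:
        return None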
+6 -4
View File
@@ -19,7 +19,9 @@ MODEL_BASE = CFG["models"]["enrich"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]
-MAX_WORKERS = CFG["ingestion"]["max_workers"]
+# MAX_WORKERS = CFG["ingestion"]["max_workers"]
+ACTIVE_LLMS = CFG["ingestion"]["active_llms"]
+PARALLEL_REQUESTS_PER_LLM = CFG["ingestion"]["parallel_requests_per_llm"]
 CHUNK_SIZE = CFG["ingestion"]["chunk_size"]
 CHUNK_OVERLAP = CFG["ingestion"]["chunk_overlap"]
 EMBEDDING_BATCH_SIZE = CFG["ingestion"]["embedding_batch_size"]
@@ -75,10 +77,10 @@ def chunk_documents(docs):
 def enrich_chunks(chunks: list) -> list:
     def process_single_chunk(indexed_chunk):
         idx, chunk = indexed_chunk
-        lm_index = idx % 8
+        lm_index = idx % ACTIVE_LLMS
         try:
-            with dspy.context(lm=dspy.LM(model=MODEL_BASE, api_base=API_BASE + API_VERSION)):
+            with dspy.context(lm=dspy.LM(model=f"{MODEL_BASE}{lm_index}", api_base=API_BASE + API_VERSION), chat_template_kwargs={"enable_thinking": False}):
                 response = IngestionAgent().ingest(note=chunk.page_content)

                 # This is now an object, not a string!
@@ -92,7 +94,7 @@ def enrich_chunks(chunks: list) -> list:
             return (idx, chunk)

     enriched_results = []
-    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+    with ThreadPoolExecutor(max_workers=PARALLEL_REQUESTS_PER_LLM * ACTIVE_LLMS) as executor:
         # Wrap chunks in enumerate to keep track of order
         futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
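The pool size now follows the config: with the values in config.yaml this is 4 × 10 = 40 threads, and `idx % ACTIVE_LLMS` fans successive chunks across the qwen-0 … qwen-9 instances loaded by the lms script. A toy illustration (hypothetical values mirroring the config, not part of the commit):

ACTIVE_LLMS = 10
PARALLEL_REQUESTS_PER_LLM = 4

print("pool size:", PARALLEL_REQUESTS_PER_LLM * ACTIVE_LLMS)  # 40
for idx in range(12):
    # chunk 0 -> qwen-0, chunk 1 -> qwen-1, ..., chunk 10 -> qwen-0 again
    print(f"chunk {idx} -> lm_studio/qwen-{idx % ACTIVE_LLMS}")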
+81 -5
View File
@@ -1,16 +1,93 @@
 import sys
 import dspy
-# import turso
+import logging
+from dspy.utils.callback import BaseCallback
+from logging.handlers import RotatingFileHandler
 from config_loader import load_config
-from experts.dnd_agent import DnDRAG
+from experts.retrieval_agent import DnDRAG

 CFG = load_config()
 RETRIEVE_MODEL = CFG["models"]["retrieval"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]
+class CallbackHandler(BaseCallback):
+    """Custom callback class for logging agent interactions."""
+
+    def __init__(self, logger):
+        """Initialize the callback with a logger instance."""
+        super().__init__()
+        self.logger = logger
+
+    def on_module_end(self, call_id, outputs, exception):
+        """Handle module end events for logging."""
+        step = "Reasoning" if self._is_reasoning_output(outputs) else "Acting"
+        self.logger.debug(f"=== {step} Step ===")
+        for k, v in outputs.items():
+            self.logger.debug(f"  {k}: {v}")
+
+    def on_lm_start(self, call_id, instance, inputs):
+        """Handle language model start events for logging."""
+        self.logger.debug(f"LM is called with inputs: {inputs}")
+
+    def on_tool_start(self, call_id, instance, inputs):
+        """Handle tool start events for logging."""
+        self.logger.debug(f"Tool {instance} called with inputs: {inputs}")
+
+    def on_tool_end(self, call_id, outputs, exception):
+        """Handle tool end events for logging."""
+        self.logger.debug(f"Tool finished with outputs: {outputs}")
+
+    def on_lm_end(self, call_id, outputs, exception):
+        """Handle language model end events for logging."""
+        self.logger.debug(f"LM is finished with outputs: {outputs}")
+
+    def _is_reasoning_output(self, outputs):
+        return any(k.startswith("Thought") for k in outputs)
+def setup_logging():
+    """Set up logging configuration for Merlin."""
+    # Create a custom logger
+    logger = logging.getLogger(__name__)
+    # Set the minimum level for the logger
+    logger.setLevel(logging.DEBUG)
+
+    # Create a console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    # Create a file handler with rotation every 5MB
+    file_handler = RotatingFileHandler(
+        "dmv.log", maxBytes=5 * 1024 * 1024, backupCount=3
+    )
+    file_handler.setLevel(logging.DEBUG)
+
+    # Create a formatter
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    # Set the formatter for the handlers
+    console_handler.setFormatter(formatter)
+    file_handler.setFormatter(formatter)
+
+    # Add the handlers to the logger
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+    return logger
 def main():
+    logger = setup_logging()
+    logger.debug("main application started")
+
+    # Add verbose callback
+    dspy.configure(verbose_errors=True)
+    dspy.configure(callbacks=[CallbackHandler(logger)])
+
     # 1. Setup the LLM
     print("🚀 Initializing Qwen-8B via LM Studio...")
     lm = dspy.LM(RETRIEVE_MODEL, api_base=API_BASE + API_VERSION)
@@ -32,7 +109,7 @@ def main():
         query = input("📝 Query: ").strip()

         # Exit conditions
-        if query.lower() in ["exit", "quit", "q"]:
+        if query.lower() in ["exit", "quit", "q", "bye"]:
             print("Farewell, traveler. Good luck on your quest!")
             break
@@ -47,11 +124,10 @@ def main():
             print(response.answer)

     except KeyboardInterrupt:
-        print("\n\nExiting... See you next session!")
+        print("\n\nRude?!.... Exiting...")
         sys.exit(0)
     except Exception as e:
         print(f"\n⚠️ An error occurred: {e}")

 if __name__ == "__main__":
     main()
+25
View File
@@ -0,0 +1,25 @@
+import dspy
+
+base_url = "http://framework.tawny-bellatrix.ts.net:1234"
+model_name = "lm_studio/qwen-0"
+
+lm = dspy.LM(
+    model=model_name,
+    api_base=f"{base_url}/v1/"
+)
+dspy.configure(lm=lm)
+
+# question = "How can i use dspy framework to add 'chat_template_kwargs={\"enable_thinking\": False}' to my API call to LM Studio? i know it uses litellm under the hood"
+# question = "Hi there, do you have a name? if not i want you to name yourself."
+question = "how long would it take light to travel from the sun to the earth? /no_think"
+
+# Call with request_kwargs to inject the template kwargs
+response = lm(
+    messages=[{"role": "user", "content": question}]
+    # extra_body={"enable_thinking": False}
+    # enable_thinking=False
+)
+
+print(response)
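The commented-out attempts above circle the question posed in the first commented prompt: how to pass chat_template_kwargs through to LM Studio. Since dspy delegates to litellm, one approach that should reach an OpenAI-compatible server is `extra_body`, which litellm forwards verbatim in the request body; whether LM Studio honours the field is an assumption to verify:

import dspy

# Sketch only, not part of this commit: pass extra_body at LM construction
# so litellm includes it in every request to the OpenAI-compatible endpoint.
lm = dspy.LM(
    model="lm_studio/qwen-0",
    api_base="http://framework.tawny-bellatrix.ts.net:1234/v1/",
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
response = lm(messages=[{"role": "user", "content": "ping"}])
print(response)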