feat: AI Read File Tool, configurable system prompts, and loading lots of LLMs

2026-03-04 15:48:25 +00:00
parent bbaebf1f70
commit 0d0e747682
10 changed files with 184 additions and 47 deletions
+1
View File
@@ -1,4 +1,5 @@
 data/*
+*.log
 # Python-generated files
 __pycache__/
+4 -6
View File
@@ -1,8 +1,10 @@
-Read File Tool for Retrieve Agent
+---Read File Tool for Retrieve Agent---
-Easy Config of system prompts
+---Easy Config of system prompts---
 examples into prompts & better prompts
+LMS CLI script to load multiple models and to make each model accept multiple inferences
 context engineering - only include vector hits within x distance?
 AI in the middle - make the AI generate the string for vector search
@@ -19,7 +21,3 @@ QA specific embedding models?
 Evaluation metrics, how good is it doing?
 rate my response!?
-examples into prompts & better prompts
-common model attributes - temp & top-k
+10
View File
@@ -0,0 +1,10 @@
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-0" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-1" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-2" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-3" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-4" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-5" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-6" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-7" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-8" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-9" --ttl 1800
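The ten commands above differ only in the identifier suffix; a minimal Python sketch that could generate them, assuming the `lms` CLI accepts exactly the flags used in this script (not part of the commit):

import subprocess

# Hypothetical generator for the ten `lms load` commands above.
for i in range(10):
    subprocess.run(
        [
            "lms", "load", "qwen/qwen3.5-4b",
            "--parallel", "4",            # concurrent inferences per instance
            "--identifier", f"qwen-{i}",  # matches the "lm_studio/qwen-" prefix in config.yaml
            "--ttl", "1800",              # seconds before an idle model unloads
        ],
        check=True,
    )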
+19 -6
View File
@@ -5,7 +5,7 @@ api:
 # --- Model Settings ---
 models:
-  enrich: "lm_studio/qwen/qwen3-8b"
+  enrich: "lm_studio/qwen-"
   embedding: "text-embedding-qwen3-embedding-8b"
   retrieval: "lm_studio/qwen/qwen3-30b-a3b-2507"
@@ -13,13 +13,26 @@ models:
 ingestion:
   data_dir: "/home/devin/DnD"
   db_path: "./data/dmv.db"
-  max_workers: 8
+  active_llms: 10
+  parallel_requests_per_llm: 4
   chunk_size: 800
   chunk_overlap: 100
   embedding_batch_size: 32
   time_file_location: "./data/time_file.txt"
-# --- Retrieval Settings ---
-retrieval:
-  top_k: 4
-  context_limit: 10000 # Max characters from full file context
+# ---- Agent Settings ----
+ingestion_agent:
+  ingestion_signature: |
+    You are an expert Dungeon Master's assistant.
+    Analyze the provided notes and extract a concise synopsis and relevant metadata.
+    synopsis = A one-sentence summary of the document.
+    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
+    entities = a list of key names of people, places, or factions.
+    "note -> synopsis:str, tags: list[str], entities: list[str]"
+retrieval_agent:
+  retrieval_signature: |
+    You are an expert Dungeon Master's assistant.
+    Given the context and the question, answer the question.
+    Do not make things up, base all of your answers on the context.
+    Always cite your sources
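Every module in this diff calls `load_config`, but `config_loader` itself is unchanged and not shown. A plausible minimal implementation, assuming PyYAML and a `config.yaml` beside the code (hypothetical, for orientation only):

# config_loader.py -- hypothetical sketch; the real module is not in this diff.
import yaml

def load_config(path: str = "config.yaml") -> dict:
    # Parse the YAML config into a plain dict, e.g. CFG["ingestion"]["active_llms"].
    with open(path) as f:
        return yaml.safe_load(f)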
+7 -3
View File
@@ -1,12 +1,16 @@
 import requests
 from langchain_core.embeddings import Embeddings
+from config_loader import load_config
+
+CFG = load_config()
+API_BASE = CFG["api"]["base_url"]
+API_VERSION = CFG["api"]["api_version"]

 class LocalLMEmbeddings(Embeddings):
     def __init__(
-        self, model: str, base_url: str = "http://192.168.0.49:1234", batch_size: int = 32
+        self, model: str, base_url: str = API_BASE, batch_size: int = 32
     ):
-        self.url = f"{base_url}/v1/embeddings"
+        self.url = f"{base_url}/{API_VERSION}/embeddings"
         self.model = model
         self.batch_size = batch_size
@@ -27,7 +31,7 @@ class LocalLMEmbeddings(Embeddings):
         return [[] for _ in input_texts]

     def embed_documents(self, texts: list[str]) -> list[list[float]]:
-        """Splits 500+ chunks into batches of 32 and processes them."""
+        """Splits chunks into batches of 32 and processes them."""
         all_embeddings = []
         for i in range(0, len(texts), self.batch_size):
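The hunk cuts off inside the batching loop, whose body is unchanged by this commit. For orientation, a standalone sketch of what such a loop typically looks like against an OpenAI-compatible /embeddings endpoint (assumed response shape, not the project's actual code):

import requests

def embed_in_batches(url: str, model: str, texts: list[str], batch_size: int = 32) -> list[list[float]]:
    # Mirrors embed_documents: slice into batches, POST each, keep order.
    all_embeddings: list[list[float]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        resp = requests.post(url, json={"model": model, "input": batch}, timeout=120)
        resp.raise_for_status()
        # OpenAI-compatible servers return one vector per input, in order.
        all_embeddings.extend(item["embedding"] for item in resp.json()["data"])
    return all_embeddings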
+5 -10
View File
@@ -1,21 +1,16 @@
 import dspy
 from typing import List
+from config_loader import load_config
+
+CFG = load_config()
+INGESTION_CONFIG = CFG["ingestion_agent"]

 class IngestionSignature(dspy.Signature):
-    """You are an expert Dungeon Master's assistant.
-    Analyze the provided notes and extract a concise synopsis and relevant metadata.
-    synopsis = A one-sentence summary of the document.
-    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
-    entities = Key names of people, places, or factions.
-    "note -> synopsis:str, tags: list[str], entities: list[str]"
-    /no_think
-    """
+    f"{INGESTION_CONFIG["ingestion_signature"]}"

     note: str = dspy.InputField(desc="The DM notes or session recap content.")
     answer: dict[str,str|List] = dspy.OutputField(desc="the metadata dictionary with the keys; synopsis, tags, entities")

 class IngestionAgent(dspy.Module):
     def __init__(self):
         self.ingest = dspy.Predict(IngestionSignature)
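One caveat with the new signature line: a bare f-string in a class body is an expression, not a docstring, so dspy will not pick it up as the signature's instructions, and nesting double quotes inside a double-quoted f-string only parses on Python 3.12+. A sketch of one way to wire the configured prompt in, assuming dspy's `with_instructions` helper:

import dspy
from typing import List
from config_loader import load_config

CFG = load_config()
INGESTION_CONFIG = CFG["ingestion_agent"]

class IngestionSignature(dspy.Signature):
    note: str = dspy.InputField(desc="The DM notes or session recap content.")
    answer: dict[str, str | List] = dspy.OutputField(
        desc="the metadata dictionary with the keys; synopsis, tags, entities"
    )

# with_instructions returns a copy of the signature carrying the configured
# prompt as its instructions; rebinding the name keeps Predict unchanged.
IngestionSignature = IngestionSignature.with_instructions(
    INGESTION_CONFIG["ingestion_signature"]
)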
View File
@@ -1,7 +1,7 @@
-# from pathlib import Path
+import os
 import turso
 import dspy
-# from langchain_community.vectorstores import FAISS
 from config_loader import load_config
 from embedding import LocalLMEmbeddings
@@ -11,11 +11,10 @@ CFG = load_config()
 DATABASE_PATH = CFG["ingestion"]["db_path"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
+RETRIEVAL_CONFIG = CFG["retrieval_agent"]

-# Inside your retrieval logic:
 def retrieve_from_turso(embedded_question, k=5):
-    # Example query: search for relevant notes using full-text search or embedding similarity
-    # Note: Turso supports SQLite, so you can use FTS5 or a vector extension if available
     query = f"""
         SELECT file_path, synopsis, tags, entities, chunk_data,
                vector_distance_cos(embedding, vector32('{embedded_question[0]}')) AS distance
@@ -31,8 +30,7 @@ def retrieve_from_turso(embedded_question, k=5):
 # --- DSPy Signature ---
 class DnDContextQA(dspy.Signature):
-    """Answer DnD campaign questions using provided details.
-    """
+    f"{RETRIEVAL_CONFIG["retrieval_signature"]}"

     context = dspy.InputField(
         desc="Relevant chunks and metadata from the campaign notes."
@@ -49,7 +47,11 @@ class DnDRAG(dspy.Module):
             base_url=API_BASE,
             batch_size=1,  # we only send 1 question at a time.
         )
-        self.generate_answer = dspy.ChainOfThought(DnDContextQA)
+        # Tools exposed to the ReAct loop
+        self.tools = [
+            self.load_file
+        ]
+        self.generate_answer = dspy.ReAct(signature=DnDContextQA, tools=self.tools)
     def forward(self, question):
         # Use Turso to retrieve relevant notes
@@ -74,11 +76,22 @@ entities: {entities}
 {content}
 """)

-        print('Closest embedding hits')
-        for part in context_parts:
-            print(part)
+        # print('Closest embedding hits')
+        # for part in context_parts:
+        #     print(part)

         context = "\n\n".join(context_parts)
         prediction = self.generate_answer(context=context, question=question)
         return dspy.Prediction(answer=prediction.answer, context=context)

+    def load_file(self, file_path) -> str | None:
+        """Load and return specified file."""
+        if os.path.exists(file_path):
+            try:
+                with open(file_path) as file:
+                    return file.read()
+            except Exception:
+                return None
+        else:
+            return None
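Because `load_file` is exposed to the model as a ReAct tool, the agent can request any readable path on the machine. A hedged sketch of a more defensive variant, confining reads to the configured notes directory (`data_dir` from config.yaml; not part of this commit):

import os
from config_loader import load_config  # same helper the module already uses

CFG = load_config()
ALLOWED_ROOT = os.path.realpath(CFG["ingestion"]["data_dir"])

def load_file(file_path: str) -> str | None:
    # Resolve symlinks and .. segments, then refuse anything outside the notes tree.
    real = os.path.realpath(file_path)
    if real != ALLOWED_ROOT and not real.startswith(ALLOWED_ROOT + os.sep):
        return None
    try:
        with open(real) as file:
            return file.read()
    except OSError:
        return None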
+6 -4
View File
@@ -19,7 +19,9 @@ MODEL_BASE = CFG["models"]["enrich"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]
-MAX_WORKERS = CFG["ingestion"]["max_workers"]
+# MAX_WORKERS = CFG["ingestion"]["max_workers"]
+ACTIVE_LLMS = CFG["ingestion"]["active_llms"]
+PARALLEL_REQUESTS_PER_LLM = CFG["ingestion"]["parallel_requests_per_llm"]
 CHUNK_SIZE = CFG["ingestion"]["chunk_size"]
 CHUNK_OVERLAP = CFG["ingestion"]["chunk_overlap"]
 EMBEDDING_BATCH_SIZE = CFG["ingestion"]["embedding_batch_size"]
@@ -75,10 +77,10 @@ def chunk_documents(docs):
 def enrich_chunks(chunks: list) -> list:
     def process_single_chunk(indexed_chunk):
         idx, chunk = indexed_chunk
-        lm_index = idx % 8
+        lm_index = idx % ACTIVE_LLMS
         try:
-            with dspy.context(lm=dspy.LM(model=MODEL_BASE, api_base=API_BASE + API_VERSION)):
+            with dspy.context(lm=dspy.LM(model=f"{MODEL_BASE}{lm_index}", api_base=API_BASE + API_VERSION), chat_template_kwargs={"enable_thinking": False}):
                 response = IngestionAgent().ingest(note=chunk.page_content)

                 # This is now an object, not a string!
@@ -92,7 +94,7 @@ def enrich_chunks(chunks: list) -> list:
             return (idx, chunk)

     enriched_results = []
-    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+    with ThreadPoolExecutor(max_workers=PARALLEL_REQUESTS_PER_LLM * ACTIVE_LLMS) as executor:
         # Wrap chunks in enumerate to keep track of order
         futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
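The pool size now follows the config: with the values in config.yaml this is 4 × 10 = 40 threads, and `idx % ACTIVE_LLMS` fans successive chunks across the qwen-0 … qwen-9 instances loaded by the lms script. A toy illustration (hypothetical values mirroring the config, not part of the commit):

ACTIVE_LLMS = 10
PARALLEL_REQUESTS_PER_LLM = 4

print("pool size:", PARALLEL_REQUESTS_PER_LLM * ACTIVE_LLMS)  # 40
for idx in range(12):
    # chunk 0 -> qwen-0, chunk 1 -> qwen-1, ..., chunk 10 -> qwen-0 again
    print(f"chunk {idx} -> lm_studio/qwen-{idx % ACTIVE_LLMS}")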
+81 -5
View File
@@ -1,16 +1,93 @@
 import sys
 import dspy
-# import turso
+import logging
+from dspy.utils.callback import BaseCallback
+from logging.handlers import RotatingFileHandler
 from config_loader import load_config
-from experts.dnd_agent import DnDRAG
+from experts.retrieval_agent import DnDRAG

 CFG = load_config()
 RETRIEVE_MODEL = CFG["models"]["retrieval"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]
+class CallbackHandler(BaseCallback):
+    """Custom callback class for logging agent interactions."""
+
+    def __init__(self, logger):
+        """Initialize the callback with a logger instance."""
+        super().__init__()
+        self.logger = logger
+
+    def on_module_end(self, call_id, outputs, exception):
+        """Handle module end events for logging."""
+        step = "Reasoning" if self._is_reasoning_output(outputs) else "Acting"
+        self.logger.debug(f"=== {step} Step ===")
+        for k, v in outputs.items():
+            self.logger.debug(f"  {k}: {v}")
+
+    def on_lm_start(self, call_id, instance, inputs):
+        """Handle language model start events for logging."""
+        self.logger.debug(f"LM is called with inputs: {inputs}")
+
+    def on_tool_start(self, call_id, instance, inputs):
+        """Handle tool start events for logging."""
+        self.logger.debug(f"Tool {instance} called with inputs: {inputs}")
+
+    def on_tool_end(self, call_id, outputs, exception):
+        """Handle tool end events for logging."""
+        self.logger.debug(f"Tool finished with outputs: {outputs}")
+
+    def on_lm_end(self, call_id, outputs, exception):
+        """Handle language model end events for logging."""
+        self.logger.debug(f"LM is finished with outputs: {outputs}")
+
+    def _is_reasoning_output(self, outputs):
+        return any(k.startswith("Thought") for k in outputs)
+def setup_logging():
+    """Set up logging configuration for Merlin."""
+    # Create a custom logger
+    logger = logging.getLogger(__name__)
+    # Set the minimum level for the logger
+    logger.setLevel(logging.DEBUG)
+
+    # Create a console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    # Create a file handler with rotation every 5MB
+    file_handler = RotatingFileHandler(
+        "dmv.log", maxBytes=5 * 1024 * 1024, backupCount=3
+    )
+    file_handler.setLevel(logging.DEBUG)
+
+    # Create a formatter
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    # Set the formatter for the handlers
+    console_handler.setFormatter(formatter)
+    file_handler.setFormatter(formatter)
+
+    # Add the handlers to the logger
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+    return logger
 def main():
+    logger = setup_logging()
+    logger.debug("main application started")
+
+    # Add verbose callback
+    dspy.configure(verbose_errors=True)
+    dspy.configure(callbacks=[CallbackHandler(logger)])
+
     # 1. Setup the LLM
     print("🚀 Initializing Qwen-8B via LM Studio...")
     lm = dspy.LM(RETRIEVE_MODEL, api_base=API_BASE + API_VERSION)
@@ -32,7 +109,7 @@ def main():
         query = input("📝 Query: ").strip()

         # Exit conditions
-        if query.lower() in ["exit", "quit", "q"]:
+        if query.lower() in ["exit", "quit", "q", "bye"]:
             print("Farewell, traveler. Good luck on your quest!")
             break
@@ -47,11 +124,10 @@ def main():
             print(response.answer)

     except KeyboardInterrupt:
-        print("\n\nExiting... See you next session!")
+        print("\n\nRude?!.... Exiting...")
         sys.exit(0)
     except Exception as e:
         print(f"\n⚠️ An error occurred: {e}")

 if __name__ == "__main__":
     main()
+25
View File
@@ -0,0 +1,25 @@
+import dspy
+
+base_url = "http://framework.tawny-bellatrix.ts.net:1234"
+model_name = "lm_studio/qwen-0"
+
+lm = dspy.LM(
+    model=model_name,
+    api_base=f"{base_url}/v1/"
+)
+dspy.configure(lm=lm)
+
+# question = "How can i use dspy framework to add 'chat_template_kwargs={\"enable_thinking\": False}' to my API call to LM Studio? i know it uses litellm under the hood"
+# question = "Hi there, do you have a name? if not i want you to name yourself."
+question = "how long would it take light to travel from the sun to the earth? /no_think"
+
+# Call with request_kwargs to inject the template kwargs
+response = lm(
+    messages=[{"role": "user", "content": question}]
+    # extra_body={"enable_thinking": False}
+    # enable_thinking=False
+)
+
+print(response)
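The commented-out attempts above circle the question posed in the first commented prompt: how to pass chat_template_kwargs through to LM Studio. Since dspy delegates to litellm, one approach that should reach an OpenAI-compatible server is `extra_body`, which litellm forwards verbatim in the request body; whether LM Studio honours the field is an assumption to verify:

import dspy

# Sketch only, not part of this commit: pass extra_body at LM construction
# so litellm includes it in every request to the OpenAI-compatible endpoint.
lm = dspy.LM(
    model="lm_studio/qwen-0",
    api_base="http://framework.tawny-bellatrix.ts.net:1234/v1/",
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
response = lm(messages=[{"role": "user", "content": "ping"}])
print(response)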