diff --git a/.gitignore b/.gitignore
index 3d689fb..9d9fc24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 data/*
+*.log
 
 # Python-generated files
 __pycache__/
diff --git a/TODO b/TODO
index de347e3..e2429c2 100644
--- a/TODO
+++ b/TODO
@@ -1,8 +1,10 @@
-Read File Tool for Retrieve Agent
+---Read File Tool for Retrieve Agent---
 
-Easy Config of system prompts
+---Easy Config of system prompts---
 
 examples into prompts & better prompts
 
+LMS CLI script to load multiple models and have each model accept multiple parallel inferences
+
 context engineering, - only include vector hits that are x distance?
 
 AI in the middle - make the ai generate the string for vector search
@@ -19,7 +21,3 @@ QA specific embedding models?
 
 Evaluation metrics, how good is it doing? rate my response!?
 
-examples into prompts & better prompts
-
-common model attributes - temp & top-k
-
diff --git a/load_ingestion_llms.sh b/load_ingestion_llms.sh
new file mode 100755
index 0000000..82f40eb
--- /dev/null
+++ b/load_ingestion_llms.sh
@@ -0,0 +1,10 @@
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-0" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-1" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-2" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-3" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-4" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-5" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-6" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-7" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-8" --ttl 1800
+lms load qwen/qwen3.5-4b --parallel 4 --identifier "qwen-9" --ttl 1800
\ No newline at end of file
diff --git a/src/config.yaml b/src/config.yaml
index 377153a..71d9146 100644
--- a/src/config.yaml
+++ b/src/config.yaml
@@ -5,7 +5,7 @@ api:
 
 # --- Model Settings ---
 models:
-  enrich: "lm_studio/qwen/qwen3-8b"
+  enrich: "lm_studio/qwen-"
   embedding: "text-embedding-qwen3-embedding-8b"
   retrieval: "lm_studio/qwen/qwen3-30b-a3b-2507"
 
@@ -13,13 +13,26 @@ models:
 ingestion:
   data_dir: "/home/devin/DnD"
   db_path: "./data/dmv.db"
-  max_workers: 8
+  active_llms: 10
+  parallel_requests_per_llm: 4
   chunk_size: 800
   chunk_overlap: 100
   embedding_batch_size: 32
   time_file_location: "./data/time_file.txt"
 
-# --- Retrieval Settings ---
-retrieval:
-  top_k: 4
-  context_limit: 10000 # Max characters from full file context
\ No newline at end of file
+# --- Agent Settings ---
+ingestion_agent:
+  ingestion_signature: |
+    You are an expert Dungeon Master's assistant.
+    Analyze the provided notes and extract a concise synopsis and relevant metadata.
+    synopsis = A one-sentence summary of the document.
+    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
+    entities = A list of key names of people, places, or factions.
+    "note -> synopsis:str, tags: list[str], entities: list[str]"
+
+retrieval_agent:
+  retrieval_signature: |
+    You are an expert Dungeon Master's assistant.
+    Given the context and the question, answer the question.
+    Do not make things up; base all of your answers on the context.
+    Always cite your sources.
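Both agent prompts now live in src/config.yaml and reach the code through config_loader.load_config, a module this diff depends on but does not touch. For orientation, a minimal sketch of what such a loader could look like; the config path and the use of yaml.safe_load are assumptions, not taken from the repo:

    # Hypothetical sketch of src/config_loader.py (not part of this diff).
    import yaml
    from pathlib import Path

    def load_config(path: str | None = None) -> dict:
        """Read the shared YAML config into a plain dict."""
        cfg_path = Path(path) if path else Path(__file__).parent / "config.yaml"
        with open(cfg_path) as fh:
            return yaml.safe_load(fh)
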
diff --git a/src/embedding.py b/src/embedding.py
index d7f0a52..8faac55 100644
--- a/src/embedding.py
+++ b/src/embedding.py
@@ -1,12 +1,16 @@
 import requests
 from langchain_core.embeddings import Embeddings
+from config_loader import load_config
 
+CFG = load_config()
+API_BASE = CFG["api"]["base_url"]
+API_VERSION = CFG["api"]["api_version"]
 
 class LocalLMEmbeddings(Embeddings):
     def __init__(
-        self, model: str, base_url: str = "http://192.168.0.49:1234", batch_size: int = 32
+        self, model: str, base_url: str = API_BASE, batch_size: int = 32
     ):
-        self.url = f"{base_url}/v1/embeddings"
+        self.url = f"{base_url}/{API_VERSION}/embeddings"
         self.model = model
         self.batch_size = batch_size
 
@@ -27,7 +31,7 @@ class LocalLMEmbeddings(Embeddings):
         return [[] for _ in input_texts]
 
     def embed_documents(self, texts: list[str]) -> list[list[float]]:
-        """Splits 500+ chunks into batches of 32 and processes them."""
+        """Splits chunks into batches of `batch_size` and processes them."""
 
         all_embeddings = []
         for i in range(0, len(texts), self.batch_size):
diff --git a/src/experts/ingestion_agent.py b/src/experts/ingestion_agent.py
index 960f74f..b71b541 100644
--- a/src/experts/ingestion_agent.py
+++ b/src/experts/ingestion_agent.py
@@ -1,21 +1,16 @@
 import dspy
 from typing import List
+from config_loader import load_config
+
+CFG = load_config()
+INGESTION_CONFIG = CFG["ingestion_agent"]
 
 class IngestionSignature(dspy.Signature):
-    """You are an expert Dungeon Master's assistant.
-    Analyze the provided notes and extract a concise synopsis and relevant metadata.
-    synopsis = A one-sentence summary of the document.
-    tags = Relevant tags (NPCs, Locations, Items, Plot Points).
-    entities = Key names of people, places, or factions.
-    "note -> synopsis:str, tags: list[str], entities: list[str]"
-    /no_think
-    """
+    # An f-string in class-body position is not a real docstring, so dspy would
+    # see no instructions; assign __doc__ explicitly to load them from config.
+    __doc__ = INGESTION_CONFIG["ingestion_signature"]
 
     note: str = dspy.InputField(desc="The DM notes or session recap content.")
     answer: dict[str, str | List] = dspy.OutputField(desc="The metadata dictionary with the keys: synopsis, tags, entities.")
-
-
 class IngestionAgent(dspy.Module):
     def __init__(self):
         self.ingest = dspy.Predict(IngestionSignature)
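With the instructions coming from config at import time, an equivalent construction that sidesteps the docstring machinery entirely is dspy's functional Signature form, which takes the field spec and the instructions as plain strings. A sketch, reusing the spec string that config.yaml already carries:

    # Functional equivalent of the class-based signature above.
    import dspy
    from config_loader import load_config

    INGESTION_CONFIG = load_config()["ingestion_agent"]

    IngestionSignature = dspy.Signature(
        "note -> synopsis: str, tags: list[str], entities: list[str]",
        INGESTION_CONFIG["ingestion_signature"],  # instructions string from YAML
    )
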
- """ + f"{RETRIEVAL_CONFIG["retrieval_signature"]}" context = dspy.InputField( desc="Relevant chunks and metadata from the campaign notes." @@ -49,7 +47,11 @@ class DnDRAG(dspy.Module): base_url=API_BASE, batch_size=1, # we only send 1 question at a time. ) - self.generate_answer = dspy.ChainOfThought(DnDContextQA) + # Tools exposed to the ReAct loop + self.tools = [ + self.load_file + ] + self.generate_answer = dspy.ReAct(signature=DnDContextQA,tools=self.tools) def forward(self, question): # Use Turso to retrieve relevant notes @@ -73,12 +75,23 @@ tags: {tags}, entities: {entities} {content} """) - - print('Closest embedding hits') - for part in context_parts: - print(part) + + # print('Closest embedding hits') + # for part in context_parts: + # print(part) context = "\n\n".join(context_parts) prediction = self.generate_answer(context=context, question=question) return dspy.Prediction(answer=prediction.answer, context=context) + + def load_file(self, file_path) -> str | None: + """Load and return specified file.""" + if os.path.exists(file_path): + try: + with open(file_path) as file: + return file.read() + except Exception: + return None + else: + return None \ No newline at end of file diff --git a/src/ingest.py b/src/ingest.py index 9188013..c50f4f5 100644 --- a/src/ingest.py +++ b/src/ingest.py @@ -19,7 +19,9 @@ MODEL_BASE = CFG["models"]["enrich"] EMBEDDING_MODEL = CFG["models"]["embedding"] API_BASE = CFG["api"]["base_url"] API_VERSION = CFG["api"]["api_version"] -MAX_WORKERS = CFG["ingestion"]["max_workers"] +# MAX_WORKERS = CFG["ingestion"]["max_workers"] +ACTIVE_LLMS = CFG["ingestion"]["active_llms"] +PARALLEL_REQUESTS_PER_LLM = CFG["ingestion"]["parallel_requests_per_llm"] CHUNK_SIZE = CFG["ingestion"]["chunk_size"] CHUNK_OVERLAP = CFG["ingestion"]["chunk_overlap"] EMBEDDING_BATCH_SIZE = CFG["ingestion"]["embedding_batch_size"] @@ -75,10 +77,10 @@ def chunk_documents(docs): def enrich_chunks(chunks: list) -> list: def process_single_chunk(indexed_chunk): idx, chunk = indexed_chunk - lm_index = idx % 8 + lm_index = idx % ACTIVE_LLMS try: - with dspy.context(lm=dspy.LM(model=MODEL_BASE, api_base=API_BASE + API_VERSION)): + with dspy.context(lm=dspy.LM(model=f"{MODEL_BASE}{lm_index}", api_base=API_BASE + API_VERSION), chat_template_kwargs={"enable_thinking": False}): response = IngestionAgent().ingest(note=chunk.page_content) # This is now an object, not a string! 
diff --git a/src/ingest.py b/src/ingest.py
index 9188013..c50f4f5 100644
--- a/src/ingest.py
+++ b/src/ingest.py
@@ -19,7 +19,9 @@
 MODEL_BASE = CFG["models"]["enrich"]
 EMBEDDING_MODEL = CFG["models"]["embedding"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]
-MAX_WORKERS = CFG["ingestion"]["max_workers"]
+ACTIVE_LLMS = CFG["ingestion"]["active_llms"]
+PARALLEL_REQUESTS_PER_LLM = CFG["ingestion"]["parallel_requests_per_llm"]
 CHUNK_SIZE = CFG["ingestion"]["chunk_size"]
 CHUNK_OVERLAP = CFG["ingestion"]["chunk_overlap"]
 EMBEDDING_BATCH_SIZE = CFG["ingestion"]["embedding_batch_size"]
@@ -75,10 +77,10 @@ def chunk_documents(docs):
 
 def enrich_chunks(chunks: list) -> list:
     def process_single_chunk(indexed_chunk):
         idx, chunk = indexed_chunk
-        lm_index = idx % 8
+        lm_index = idx % ACTIVE_LLMS
 
         try:
-            with dspy.context(lm=dspy.LM(model=MODEL_BASE, api_base=API_BASE + API_VERSION)):
+            # chat_template_kwargs belongs on the LM itself, not on dspy.context,
+            # where it would sit as an unused settings key and be silently ignored.
+            with dspy.context(lm=dspy.LM(model=f"{MODEL_BASE}{lm_index}", api_base=API_BASE + API_VERSION, chat_template_kwargs={"enable_thinking": False})):
                 response = IngestionAgent().ingest(note=chunk.page_content)
                 # This is now an object, not a string!
@@ -92,7 +94,7 @@ def enrich_chunks(chunks: list) -> list:
         return (idx, chunk)
 
     enriched_results = []
-    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+    with ThreadPoolExecutor(max_workers=PARALLEL_REQUESTS_PER_LLM * ACTIVE_LLMS) as executor:
         # Wrap chunks in enumerate to keep track of order
         futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
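The executor is sized so every loaded model instance stays saturated: active_llms * parallel_requests_per_llm threads, with chunks dealt round-robin by idx % ACTIVE_LLMS onto the qwen-0 through qwen-9 identifiers registered by load_ingestion_llms.sh. A small sketch of the distribution with the config values above:

    # How 100 chunks fan out across the 10 LM Studio instances.
    ACTIVE_LLMS = 10                # ingestion.active_llms
    PARALLEL_REQUESTS_PER_LLM = 4   # ingestion.parallel_requests_per_llm

    workers = ACTIVE_LLMS * PARALLEL_REQUESTS_PER_LLM  # 40 threads in the pool
    assignments = [f"qwen-{i % ACTIVE_LLMS}" for i in range(100)]
    assert assignments.count("qwen-0") == 10  # each instance gets an even share
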
diff --git a/src/retrieve.py b/src/retrieve.py
index f7cc2af..cd4e00f 100644
--- a/src/retrieve.py
+++ b/src/retrieve.py
@@ -1,16 +1,93 @@
 import sys
 import dspy
-# import turso
+import logging
+from dspy.utils.callback import BaseCallback
+
+from logging.handlers import RotatingFileHandler
 
 from config_loader import load_config
-from experts.dnd_agent import DnDRAG
+from experts.retrieval_agent import DnDRAG
 
 CFG = load_config()
 RETRIEVE_MODEL = CFG["models"]["retrieval"]
 API_BASE = CFG["api"]["base_url"]
 API_VERSION = CFG["api"]["api_version"]
 
+class CallbackHandler(BaseCallback):
+    """Custom callback class for logging agent interactions."""
+
+    def __init__(self, logger):
+        """Initialize the callback with a logger instance."""
+        super().__init__()
+        self.logger = logger
+
+    def on_module_end(self, call_id, outputs, exception):
+        """Handle module end events for logging."""
+        step = "Reasoning" if self._is_reasoning_output(outputs) else "Acting"
+        self.logger.debug(f"=== {step} Step ===")
+        for k, v in outputs.items():
+            self.logger.debug(f"  {k}: {v}")
+
+    def on_lm_start(self, call_id, instance, inputs):
+        """Handle language model start events for logging."""
+        self.logger.debug(f"LM is called with inputs: {inputs}")
+
+    def on_tool_start(self, call_id, instance, inputs):
+        """Handle tool start events for logging."""
+        self.logger.debug(f"Tool {instance} called with inputs: {inputs}")
+
+    def on_tool_end(self, call_id, outputs, exception):
+        """Handle tool end events for logging."""
+        self.logger.debug(f"Tool finished with outputs: {outputs}")
+
+    def on_lm_end(self, call_id, outputs, exception):
+        """Handle language model end events for logging."""
+        self.logger.debug(f"LM finished with outputs: {outputs}")
+
+    def _is_reasoning_output(self, outputs):
+        return any(k.startswith("Thought") for k in outputs)
+
+
+def setup_logging():
+    """Set up console and rotating-file logging."""
+    # Create a custom logger
+    logger = logging.getLogger(__name__)
+
+    # Set the minimum level for the logger
+    logger.setLevel(logging.DEBUG)
+
+    # Create a console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    # Create a file handler that rotates at 5 MB
+    file_handler = RotatingFileHandler(
+        "dmv.log", maxBytes=5 * 1024 * 1024, backupCount=3
+    )
+    file_handler.setLevel(logging.DEBUG)
+
+    # Create a formatter
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+
+    # Set the formatter for both handlers
+    console_handler.setFormatter(formatter)
+    file_handler.setFormatter(formatter)
+
+    # Add the handlers to the logger
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+
+    return logger
+
+
 def main():
+    logger = setup_logging()
+    logger.debug("main application started")
+
+    # Add verbose callback
+    dspy.configure(verbose_errors=True)
+    dspy.configure(callbacks=[CallbackHandler(logger)])
+
     # 1. Set up the LLM
-    print("šŸš€ Initializing Qwen-8B via LM Studio...")
+    print(f"šŸš€ Initializing {RETRIEVE_MODEL} via LM Studio...")
     lm = dspy.LM(RETRIEVE_MODEL, api_base=API_BASE + API_VERSION)
@@ -32,7 +109,7 @@ def main():
         query = input("šŸ“ Query: ").strip()
 
         # Exit conditions
-        if query.lower() in ["exit", "quit", "q"]:
+        if query.lower() in ["exit", "quit", "q", "bye"]:
             print("Farewell, traveler. Good luck on your quest!")
             break
 
@@ -47,11 +124,10 @@
             print(response.answer)
 
     except KeyboardInterrupt:
-        print("\n\nExiting... See you next session!")
+        print("\n\nRude?!.... Exiting...")
         sys.exit(0)
     except Exception as e:
         print(f"\nāš ļø An error occurred: {e}")
 
-
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/test_no_think.py b/test_no_think.py
new file mode 100644
index 0000000..98f824e
--- /dev/null
+++ b/test_no_think.py
@@ -0,0 +1,25 @@
+import dspy
+
+base_url = "http://framework.tawny-bellatrix.ts.net:1234"
+model_name = "lm_studio/qwen-0"
+
+
+lm = dspy.LM(
+    model=model_name,
+    api_base=f"{base_url}/v1/"
+)
+dspy.configure(lm=lm)
+
+
+# question = "How can I use the dspy framework to add 'chat_template_kwargs={\"enable_thinking\": False}' to my API call to LM Studio? I know it uses litellm under the hood"
+# question = "Hi there, do you have a name? If not, I want you to name yourself."
+question = "How long would it take light to travel from the sun to the earth? /no_think"
+
+# Plain call; the commented kwargs below are alternate attempts to inject enable_thinking
+response = lm(
+    messages=[{"role": "user", "content": question}]
+    # extra_body={"enable_thinking": False}
+    # enable_thinking=False
+)
+
+print(response)
\ No newline at end of file
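test_no_think.py is still probing how to switch off Qwen3's thinking blocks. Two avenues seem worth trying, neither verified against LM Studio here: the /no_think tag already in the prompt, and forwarding chat_template_kwargs through LiteLLM's extra_body, which some OpenAI-compatible servers (vLLM, for one) honor; whether LM Studio does is an assumption:

    import dspy

    # Unverified sketch: dspy.LM forwards extra kwargs to litellm.completion,
    # and extra_body rides along in the request to the OpenAI-compatible server.
    # If LM Studio ignores chat_template_kwargs, fall back to the /no_think tag.
    lm = dspy.LM(
        model="lm_studio/qwen-0",
        api_base="http://framework.tawny-bellatrix.ts.net:1234/v1/",
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    print(lm(messages=[{"role": "user", "content": "Name the closest star to the sun."}]))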