chore: 🧹 removing clutter
+19 -25
@@ -10,8 +10,11 @@ from tqdm import tqdm
 from embedding import LocalLMEmbeddings
 from experts.ingestion_agent import IngestionAgent
+from config_loader import load_config

-DATA_DIR = "/home/cosmic/DnD"
+CFG = load_config()
+DATA_DIR = CFG["ingestion"]["data_dir"]

 def load_documents():
     docs = []
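
The CFG keys read throughout this diff (ingestion.data_dir, ingestion.chunk_size, ingestion.chunk_overlap, ingestion.max_workers, models.inference, api.base_url, api.api_version) imply a config file of roughly the shape sketched below. config_loader itself is not part of this commit, so the YAML format, file name, and implementation here are assumptions for illustration only.

# Hypothetical sketch of config_loader.load_config; not part of this commit.
# The file format (YAML) and path are assumptions; the keys are the ones
# the diff actually reads from CFG.
import yaml  # assumes PyYAML is available

def load_config(path: str = "config.yaml") -> dict:
    # Expected shape, inferred from the CFG[...] lookups in this diff:
    #   ingestion: {data_dir, chunk_size, chunk_overlap, max_workers}
    #   models:    {inference}
    #   api:       {base_url, api_version}
    with open(path) as f:
        return yaml.safe_load(f)
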
@@ -41,47 +44,38 @@ def load_documents():
 def chunk_documents(docs):
     # LangChain preserves metadata during splitting automatically
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=800,
-        chunk_overlap=100,
+        chunk_size=CFG["ingestion"]["chunk_size"],
+        chunk_overlap=CFG["ingestion"]["chunk_overlap"],
         separators=["\n\n", "\n", ". ", " ", ""]
     )
     return text_splitter.split_documents(docs)
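
For reference, the separators list makes the splitter prefer paragraph breaks, then single newlines, then sentence boundaries, then spaces, cutting mid-word only as a last resort. A standalone sketch of the same configuration, with literal sizes standing in for the CFG values; the import path varies by LangChain version, and langchain_text_splitters is assumed here.

# Standalone demo of the splitter configured above; literal values stand in
# for CFG["ingestion"]["chunk_size"] / CFG["ingestion"]["chunk_overlap"].
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""],
)
pieces = splitter.split_text("A paragraph about the party.\n\nAnother one.\n" * 60)
print(len(pieces), repr(pieces[0][:60]))
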

 def enrich_chunks(chunks: list) -> list:
-    MODEL_BASE = "lm_studio/qwen/qwen3-8b"
-    API_BASE = "http://192.168.0.49:1234/v1/"
+    MODEL_BASE = CFG["models"]["inference"]
+    API_BASE = CFG["api"]["base_url"]
+    API_VERSION = CFG["api"]["api_version"]
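
Splitting the old hardcoded endpoint into base_url plus api_version only round-trips if the two config values concatenate back to the removed literal. A quick check under that assumption; the split point chosen here is a guess, since the actual config values are not in the commit.

# Assumed config values; they must recompose the endpoint the removed
# hardcoded API_BASE ("http://192.168.0.49:1234/v1/") contained.
API_BASE = "http://192.168.0.49:1234/"  # stand-in for CFG["api"]["base_url"]
API_VERSION = "v1/"                     # stand-in for CFG["api"]["api_version"]
assert API_BASE + API_VERSION == "http://192.168.0.49:1234/v1/"
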

     def process_single_chunk(indexed_chunk):
         idx, chunk = indexed_chunk
         lm_index = idx % 8

         try:
-            # Configure context for this specific thread
-            with dspy.context(lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE)):
-                # Pass the text, but we will update the original chunk object
-                response = IngestionAgent().ingest(note=chunk.page_content)
+            with dspy.context(lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE+API_VERSION)):
+                response = IngestionAgent().forward(note=chunk.page_content)

-            answer = response.answer
-            start = answer.find("{")
-            end = answer.rfind("}") + 1
-            metadata_extracted = json.loads(answer[start:end])
-
-            # UPDATE: Put AI data in a sub-key to avoid overwriting 'source'
-            chunk.metadata["enrichment"] = metadata_extracted
-            # Also flatten tags for easier searching if needed
-            if "tags" in metadata_extracted:
-                chunk.metadata["tags"] = metadata_extracted["tags"]
+            # This is now an object, not a string!
+            metadata = response.answer.dict()

         except Exception as e:
-            # If enrichment fails, we KEEP the chunk but flag the error
-            # This ensures 'source' and 'full_path' are NEVER lost
-            chunk.metadata["enrichment_error"] = str(e)
-            chunk.metadata["tags"] = ["error"]
             print(f"⚠️ Failed for chunk {idx}: {e}")
+            metadata = {"synopsis": "Summary failed", "tags": ["error"], "entities": []}

-        return idx, chunk
+        chunk.metadata.update(metadata)
+        return chunk
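
For response.answer.dict() to return usable metadata, the agent's output field must be typed as a Pydantic model rather than a plain string. Below is a hypothetical sketch of what experts/ingestion_agent.py could look like; the field names are inferred from the error fallback above, and none of this file is shown in the commit.

# Hypothetical sketch of experts/ingestion_agent.py; field names inferred
# from the fallback dict {"synopsis", "tags", "entities"} in this diff.
import dspy
from pydantic import BaseModel

class NoteMetadata(BaseModel):
    synopsis: str
    tags: list[str]
    entities: list[str]

class IngestNote(dspy.Signature):
    """Extract structured metadata from a campaign note."""
    note: str = dspy.InputField()
    answer: NoteMetadata = dspy.OutputField()

class IngestionAgent(dspy.Module):
    def __init__(self):
        super().__init__()
        self.ingest = dspy.Predict(IngestNote)

    def forward(self, note: str):
        # Returns a Prediction whose .answer is a parsed NoteMetadata object,
        # which is what makes response.answer.dict() work above.
        return self.ingest(note=note)
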

     enriched_results = []
-    with ThreadPoolExecutor(max_workers=8) as executor:
+    with ThreadPoolExecutor(max_workers=CFG["ingestion"]["max_workers"]) as executor:
         # Wrap chunks in enumerate to keep track of order
         futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
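
The visible diff ends at the submit loop, before the futures are collected. A minimal sketch of the usual drain pattern, using the tqdm import visible in the first hunk header; the actual collection loop is outside this diff, and note that as_completed yields in completion order, not submission order.

# Sketch: collect results with a progress bar as workers finish.
# enriched_results and futures are the names defined above.
from concurrent.futures import as_completed
from tqdm import tqdm

for future in tqdm(as_completed(futures), total=len(futures)):
    enriched_results.append(future.result())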