ruff check and format
This commit is contained in:
+22
-19
@@ -1,4 +1,3 @@
|
||||
import json
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
@@ -8,10 +7,9 @@ from langchain_community.vectorstores import FAISS
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from tqdm import tqdm
|
||||
|
||||
from config_loader import load_config
|
||||
from embedding import LocalLMEmbeddings
|
||||
from experts.ingestion_agent import IngestionAgent
|
||||
from config_loader import load_config
|
||||
|
||||
|
||||
CFG = load_config()
|
||||
DATA_DIR = CFG["ingestion"]["data_dir"]
|
||||
@@ -20,10 +18,11 @@ MODEL_BASE = CFG["models"]["enrich"]
|
||||
EMBEDDING_MODEL = CFG["models"]["embedding"]
API_BASE = CFG["api"]["base_url"]
API_VERSION = CFG["api"]["api_version"]

# Ingestion tuning knobs, read once from config at import time.
MAX_WORKERS = CFG["ingestion"]["max_workers"]
# BUG FIX: a stray trailing comma previously made this a 1-tuple
# (`CHUNK_SIZE = (CFG[...]["chunk_size"],)` after ruff format), which would
# be passed where the text splitter expects a plain int. Bind the int itself.
CHUNK_SIZE = CFG["ingestion"]["chunk_size"]
CHUNK_OVERLAP = CFG["ingestion"]["chunk_overlap"]
EMBEDDING_BATCH_SIZE = CFG["ingestion"]["embedding_batch_size"]
|
||||
|
||||
|
||||
def load_documents():
|
||||
docs = []
|
||||
@@ -50,27 +49,28 @@ def load_documents():
|
||||
|
||||
return docs
|
||||
|
||||
|
||||
def chunk_documents(docs):
    """Split loaded documents into overlapping text chunks.

    LangChain preserves each document's metadata during splitting
    automatically, so nothing needs to be copied over by hand.

    Args:
        docs: List of LangChain ``Document`` objects from ``load_documents``.

    Returns:
        List of ``Document`` chunks of at most ``CHUNK_SIZE`` characters,
        adjacent chunks overlapping by ``CHUNK_OVERLAP``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        # Prefer paragraph, then line, then sentence boundaries before
        # falling back to word- and character-level splits.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    return text_splitter.split_documents(docs)
|
||||
|
||||
|
||||
def enrich_chunks(chunks: list) -> list:
|
||||
|
||||
def process_single_chunk(indexed_chunk):
|
||||
idx, chunk = indexed_chunk
|
||||
lm_index = idx % 8
|
||||
|
||||
|
||||
try:
|
||||
with dspy.context(lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE+API_VERSION)):
|
||||
with dspy.context(
|
||||
lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE + API_VERSION)
|
||||
):
|
||||
response = IngestionAgent().forward(note=chunk.page_content)
|
||||
|
||||
|
||||
# This is now an object, not a string!
|
||||
metadata = response.answer.dict()
|
||||
metadata = response.answer.dict()
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠️ Failed for chunk {idx}: {e}")
|
||||
@@ -78,7 +78,6 @@ def enrich_chunks(chunks: list) -> list:
|
||||
|
||||
chunk.metadata.update(metadata)
|
||||
return chunk
|
||||
|
||||
|
||||
enriched_results = []
|
||||
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
||||
@@ -92,6 +91,7 @@ def enrich_chunks(chunks: list) -> list:
|
||||
enriched_results.sort(key=lambda x: x[0])
|
||||
return [item[1] for item in enriched_results]
|
||||
|
||||
|
||||
def store_chunks_locally(chunks, db_path=DATABASE_PATH):
|
||||
embeddings_model = LocalLMEmbeddings(
|
||||
model=EMBEDDING_MODEL,
|
||||
@@ -106,14 +106,17 @@ def store_chunks_locally(chunks, db_path=DATABASE_PATH):
|
||||
print(f"✅ Successfully stored in FAISS at '{db_path}'")
|
||||
return vectorstore
|
||||
|
||||
|
||||
def main():
    """Run the full ingestion pipeline: load -> chunk -> enrich -> store."""
    docs = load_documents()
    if not docs:
        # Nothing to ingest; bail out before building the splitter/embedder.
        return

    chunks = chunk_documents(docs)
    enriched_chunks = enrich_chunks(chunks)
    store_chunks_locally(enriched_chunks)
    print("🎉 Ingestion complete!")


if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user