feat: 🔗 Tidy

2026-01-28 12:36:47 +00:00
parent d5f8d72e46
commit 8ca23187e3
4 changed files with 41 additions and 15 deletions
@@ -15,6 +15,15 @@ from config_loader import load_config

 CFG = load_config()
 DATA_DIR = CFG["ingestion"]["data_dir"]
+DATABASE_PATH = CFG["ingestion"]["db_path"]
+MODEL_BASE = CFG["models"]["enrich"]  # NOTE(review): enrich_chunks previously read CFG["models"]["inference"]; confirm "enrich" is the intended key
+EMBEDDING_MODEL = CFG["models"]["embedding"]
+API_BASE = CFG["api"]["base_url"]
+API_VERSION = CFG["api"]["api_version"]
+MAX_WORKERS = CFG["ingestion"]["max_workers"]
+CHUNK_SIZE = CFG["ingestion"]["chunk_size"]  # no trailing comma: a comma here would bind a 1-tuple instead of the configured value
+CHUNK_OVERLAP = CFG["ingestion"]["chunk_overlap"]
+EMBEDDING_BATCH_SIZE = CFG["ingestion"]["embedding_batch_size"]

 def load_documents():
    docs = []
@@ -44,16 +53,13 @@ def load_documents():
 def chunk_documents(docs):
    # Split on progressively finer separators (blank line, newline, sentence end, space, single character); LangChain preserves each document's metadata on its chunks automatically
    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=CFG["ingestion"]["chunk_size"], 
-        chunk_overlap=CFG["ingestion"]["chunk_overlap"], 
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    return text_splitter.split_documents(docs)

 def enrich_chunks(chunks: list) -> list:
-    MODEL_BASE = CFG["models"]["inference"]
-    API_BASE = CFG["api"]["base_url"]
-    API_VERSION = CFG["api"]["api_version"]
    
    def process_single_chunk(indexed_chunk):
        idx, chunk = indexed_chunk
@@ -75,7 +81,7 @@ def enrich_chunks(chunks: list) -> list:
            

    enriched_results = []
-    with ThreadPoolExecutor(max_workers=CFG["ingestion"]["max_workers"]) as executor:
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Wrap chunks in enumerate to keep track of order
        futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]

@@ -86,11 +92,11 @@ def enrich_chunks(chunks: list) -> list:
    enriched_results.sort(key=lambda x: x[0])
    return [item[1] for item in enriched_results]

-def store_chunks_locally(chunks, db_path="./local_faiss_db"):
+def store_chunks_locally(chunks, db_path=DATABASE_PATH):
    embeddings_model = LocalLMEmbeddings(
-        model="text-embedding-qwen3-embedding-8b",
-        base_url="http://192.168.0.49:1234",
-        batch_size=32,
+        model=EMBEDDING_MODEL,
+        base_url=API_BASE,
+        batch_size=EMBEDDING_BATCH_SIZE,
    )

    print(f"Index creation started for {len(chunks)} chunks...")