feat: 🔗 Tidy
This commit is contained in:
+16
-10
@@ -15,6 +15,15 @@ from config_loader import load_config
|
||||
|
||||
CFG = load_config()
|
||||
DATA_DIR = CFG["ingestion"]["data_dir"]
|
||||
DATABASE_PATH = CFG["ingestion"]["db_path"]
|
||||
MODEL_BASE = CFG["models"]["enrich"]
|
||||
EMBEDDING_MODEL = CFG["models"]["embedding"]
|
||||
API_BASE = CFG["api"]["base_url"]
|
||||
API_VERSION = CFG["api"]["api_version"]
|
||||
MAX_WORKERS=CFG["ingestion"]["max_workers"]
|
||||
CHUNK_SIZE=CFG["ingestion"]["chunk_size"]
|
||||
CHUNK_OVERLAP=CFG["ingestion"]["chunk_overlap"]
|
||||
EMBEDDING_BATCH_SIZE=CFG["ingestion"]["embedding_batch_size"]
|
||||
|
||||
def load_documents():
|
||||
docs = []
|
||||
@@ -44,16 +53,13 @@ def load_documents():
|
||||
def chunk_documents(docs):
|
||||
# LangChain preserves metadata during splitting automatically
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=CFG["ingestion"]["chunk_size"],
|
||||
chunk_overlap=CFG["ingestion"]["chunk_overlap"],
|
||||
chunk_size=CHUNK_SIZE,
|
||||
chunk_overlap=CHUNK_OVERLAP,
|
||||
separators=["\n\n", "\n", ". ", " ", ""]
|
||||
)
|
||||
return text_splitter.split_documents(docs)
|
||||
|
||||
def enrich_chunks(chunks: list) -> list:
|
||||
MODEL_BASE = CFG["models"]["inference"]
|
||||
API_BASE = CFG["api"]["base_url"]
|
||||
API_VERSION = CFG["api"]["api_version"]
|
||||
|
||||
def process_single_chunk(indexed_chunk):
|
||||
idx, chunk = indexed_chunk
|
||||
@@ -75,7 +81,7 @@ def enrich_chunks(chunks: list) -> list:
|
||||
|
||||
|
||||
enriched_results = []
|
||||
with ThreadPoolExecutor(max_workers=CFG["ingestion"]["max_workers"]) as executor:
|
||||
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
||||
# Wrap chunks in enumerate to keep track of order
|
||||
futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
|
||||
|
||||
@@ -86,11 +92,11 @@ def enrich_chunks(chunks: list) -> list:
|
||||
enriched_results.sort(key=lambda x: x[0])
|
||||
return [item[1] for item in enriched_results]
|
||||
|
||||
def store_chunks_locally(chunks, db_path="./local_faiss_db"):
|
||||
def store_chunks_locally(chunks, db_path=DATABASE_PATH):
|
||||
embeddings_model = LocalLMEmbeddings(
|
||||
model="text-embedding-qwen3-embedding-8b",
|
||||
base_url="http://192.168.0.49:1234",
|
||||
batch_size=32,
|
||||
model=EMBEDDING_MODEL,
|
||||
base_url=API_BASE,
|
||||
batch_size=EMBEDDING_BATCH_SIZE,
|
||||
)
|
||||
|
||||
print(f"Index creation started for {len(chunks)} chunks...")
|
||||
|
||||
Reference in New Issue
Block a user