chore: 🧹 removing clutter
+19 -25
@@ -10,8 +10,11 @@ from tqdm import tqdm
 from embedding import LocalLMEmbeddings
 from experts.ingestion_agent import IngestionAgent
+from config_loader import load_config

-DATA_DIR = "/home/cosmic/DnD"
+CFG = load_config()
+DATA_DIR = CFG["ingestion"]["data_dir"]

 def load_documents():
     docs = []
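
The CFG keys read throughout this diff (ingestion.data_dir, ingestion.chunk_size, ingestion.chunk_overlap, ingestion.max_workers, models.inference, api.base_url, api.api_version) imply a config file of roughly the shape sketched below. config_loader itself is not part of this commit, so the YAML format, file name, and implementation here are assumptions for illustration only.

# Hypothetical sketch of config_loader.load_config; not part of this commit.
# The file format (YAML) and path are assumptions; the keys are the ones
# the diff actually reads from CFG.
import yaml  # assumes PyYAML is available

def load_config(path: str = "config.yaml") -> dict:
    # Expected shape, inferred from the CFG[...] lookups in this diff:
    #   ingestion: {data_dir, chunk_size, chunk_overlap, max_workers}
    #   models:    {inference}
    #   api:       {base_url, api_version}
    with open(path) as f:
        return yaml.safe_load(f)
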
@@ -41,47 +44,38 @@ def load_documents():
 def chunk_documents(docs):
     # LangChain preserves metadata during splitting automatically
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=800,
-        chunk_overlap=100,
+        chunk_size=CFG["ingestion"]["chunk_size"],
+        chunk_overlap=CFG["ingestion"]["chunk_overlap"],
         separators=["\n\n", "\n", ". ", " ", ""]
     )
     return text_splitter.split_documents(docs)
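
For reference, the separators list makes the splitter prefer paragraph breaks, then single newlines, then sentence boundaries, then spaces, cutting mid-word only as a last resort. A standalone sketch of the same configuration, with literal sizes standing in for the CFG values; the import path varies by LangChain version, and langchain_text_splitters is assumed here.

# Standalone demo of the splitter configured above; literal values stand in
# for CFG["ingestion"]["chunk_size"] / CFG["ingestion"]["chunk_overlap"].
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""],
)
pieces = splitter.split_text("A paragraph about the party.\n\nAnother one.\n" * 60)
print(len(pieces), repr(pieces[0][:60]))
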

 def enrich_chunks(chunks: list) -> list:
-    MODEL_BASE = "lm_studio/qwen/qwen3-8b"
-    API_BASE = "http://192.168.0.49:1234/v1/"
+    MODEL_BASE = CFG["models"]["inference"]
+    API_BASE = CFG["api"]["base_url"]
+    API_VERSION = CFG["api"]["api_version"]
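
Splitting the old hardcoded endpoint into base_url plus api_version only round-trips if the two config values concatenate back to the removed literal. A quick check under that assumption; the split point chosen here is a guess, since the actual config values are not in the commit.

# Assumed config values; they must recompose the endpoint the removed
# hardcoded API_BASE ("http://192.168.0.49:1234/v1/") contained.
API_BASE = "http://192.168.0.49:1234/"  # stand-in for CFG["api"]["base_url"]
API_VERSION = "v1/"                     # stand-in for CFG["api"]["api_version"]
assert API_BASE + API_VERSION == "http://192.168.0.49:1234/v1/"
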

     def process_single_chunk(indexed_chunk):
         idx, chunk = indexed_chunk
         lm_index = idx % 8

         try:
-            # Configure context for this specific thread
-            with dspy.context(lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE)):
-                # Pass the text, but we will update the original chunk object
-                response = IngestionAgent().ingest(note=chunk.page_content)
+            with dspy.context(lm=dspy.LM(f"{MODEL_BASE}:{lm_index}", api_base=API_BASE+API_VERSION)):
+                response = IngestionAgent().forward(note=chunk.page_content)

-            answer = response.answer
-            start = answer.find("{")
-            end = answer.rfind("}") + 1
-            metadata_extracted = json.loads(answer[start:end])
-
-            # UPDATE: Put AI data in a sub-key to avoid overwriting 'source'
-            chunk.metadata["enrichment"] = metadata_extracted
-            # Also flatten tags for easier searching if needed
-            if "tags" in metadata_extracted:
-                chunk.metadata["tags"] = metadata_extracted["tags"]
+            # This is now an object, not a string!
+            metadata = response.answer.dict()

         except Exception as e:
-            # If enrichment fails, we KEEP the chunk but flag the error
-            # This ensures 'source' and 'full_path' are NEVER lost
-            chunk.metadata["enrichment_error"] = str(e)
-            chunk.metadata["tags"] = ["error"]
             print(f"⚠️ Failed for chunk {idx}: {e}")
+            metadata = {"synopsis": "Summary failed", "tags": ["error"], "entities": []}

-        return idx, chunk
+        chunk.metadata.update(metadata)
+        return chunk
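
For response.answer.dict() to return usable metadata, the agent's output field must be typed as a Pydantic model rather than a plain string. Below is a hypothetical sketch of what experts/ingestion_agent.py could look like; the field names are inferred from the error fallback above, and none of this file is shown in the commit.

# Hypothetical sketch of experts/ingestion_agent.py; field names inferred
# from the fallback dict {"synopsis", "tags", "entities"} in this diff.
import dspy
from pydantic import BaseModel

class NoteMetadata(BaseModel):
    synopsis: str
    tags: list[str]
    entities: list[str]

class IngestNote(dspy.Signature):
    """Extract structured metadata from a campaign note."""
    note: str = dspy.InputField()
    answer: NoteMetadata = dspy.OutputField()

class IngestionAgent(dspy.Module):
    def __init__(self):
        super().__init__()
        self.ingest = dspy.Predict(IngestNote)

    def forward(self, note: str):
        # Returns a Prediction whose .answer is a parsed NoteMetadata object,
        # which is what makes response.answer.dict() work above.
        return self.ingest(note=note)
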

     enriched_results = []
-    with ThreadPoolExecutor(max_workers=8) as executor:
+    with ThreadPoolExecutor(max_workers=CFG["ingestion"]["max_workers"]) as executor:
         # Wrap chunks in enumerate to keep track of order
         futures = [executor.submit(process_single_chunk, (i, c)) for i, c in enumerate(chunks)]
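
The visible diff ends at the submit loop, before the futures are collected. A minimal sketch of the usual drain pattern, using the tqdm import visible in the first hunk header; the actual collection loop is outside this diff, and note that as_completed yields in completion order, not submission order.

# Sketch: collect results with a progress bar as workers finish.
# enriched_results and futures are the names defined above.
from concurrent.futures import as_completed
from tqdm import tqdm

for future in tqdm(as_completed(futures), total=len(futures)):
    enriched_results.append(future.result())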