feat: 🔒 Starting the refactor
This commit is contained in:
+5
-2
@@ -12,6 +12,7 @@ from tqdm import tqdm
|
||||
from config_loader import load_config
|
||||
from embedding import LocalLMEmbeddings
|
||||
from experts.ingestion_agent import IngestionAgent
|
||||
from toon_utils import save_entities_from_chunks
|
||||
|
||||
CFG = load_config()
|
||||
DATA_DIR = CFG["ingestion"]["data_dir"]
|
||||
@@ -206,7 +207,7 @@ def embed_chunks(chunks: List[Any], batch_size: int = EMBEDDING_BATCH_SIZE) -> L
|
||||
{
|
||||
"file_path": normalize_path(chunk.metadata.get("full_path", "unknown")),
|
||||
"file_name": chunk.metadata.get("source", "unknown"),
|
||||
"chunk_data": content,
|
||||
"chunk_data": chunk.page_content,
|
||||
"synopsis": "Embedding failed",
|
||||
"tags": ["error"],
|
||||
"entities": [],
|
||||
@@ -250,7 +251,7 @@ def save_to_db(chunk_dicts):
|
||||
entry["chunk_data"],
|
||||
entry["synopsis"],
|
||||
",".join(entry["tags"]), # Store as comma-separated string
|
||||
",".join(entry["entities"]), # Store as comma-separated string
|
||||
",".join(e.get("name", str(e)) if isinstance(e, dict) else str(e) for e in entry["entities"]), # Store as comma-separated string
|
||||
embedding_str,
|
||||
entry["timestamp"],
|
||||
)
|
||||
@@ -370,6 +371,8 @@ def main():
|
||||
embedded_chunks = embed_chunks(enriched_chunks)
|
||||
print(f"Embedded {len(embedded_chunks)} chunks.")
|
||||
|
||||
save_entities_from_chunks(embedded_chunks)
|
||||
|
||||
# remove existing rows from notes table that match file path
|
||||
delete_from_db(embedded_chunks)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user