starting to build the warehouse

2024-08-09 14:23:52 +01:00
parent 33c9496eb0
commit 165464820b
9 changed files with 67 additions and 7 deletions
@@ -5,3 +5,4 @@ server_knowledge_cache.json
 data/*
 .venv/*
 __pycache__/*
 */__pycache__/*
@@ -6,7 +6,7 @@ entities:
  - transactions
  - scheduled_transactions
 base_url: https://api.ynab.com/v1/budgets
-knowledge_file: server_knowledge_cache.json
+knowledge_file: data\server_knowledge_cache.json
 primary_keys:
  accounts:
    unique_id: id
@@ -20,5 +20,61 @@ erDiagram
        string account_type_name
    }
    CATEGORIES {
        int category_id
        string category_name
        string category_group_name
        boolean hidden
        text note
        decimal budgeted
        decimal activity
        decimal balance
        boolean deleted
    }
    PAYEES {
        int payee_id
        string payee_name
        boolean deleted
    }
    DATES {
        int date_id
        string date
        int year
        int month
        int day
    }
    TRANSACTIONS {
        int transaction_id
        int account_id
        int category_id
        int payee_id
        int date_id
        decimal amount
        boolean cleared
        boolean approved
        boolean deleted
    }
    SCHEDULED_TRANSACTIONS {
        int scheduled_transaction_id
        int account_id
        int category_id
        int payee_id
        int date_id
        decimal amount
        string frequency
        boolean deleted
    }
    ACCOUNTS ||--o{ ACCOUNT_TYPES : "has type"
-```
+    TRANSACTIONS ||--o{ ACCOUNTS : "belongs to"
    TRANSACTIONS ||--o{ CATEGORIES : "belongs to"
    TRANSACTIONS ||--o{ PAYEES : "belongs to"
    TRANSACTIONS ||--o{ DATES : "occurred on"
    SCHEDULED_TRANSACTIONS ||--o{ ACCOUNTS : "belongs to"
    SCHEDULED_TRANSACTIONS ||--o{ CATEGORIES : "belongs to"
    SCHEDULED_TRANSACTIONS ||--o{ PAYEES : "belongs to"
    SCHEDULED_TRANSACTIONS ||--o{ DATES : "scheduled on"
@@ -3,8 +3,8 @@ import dotenv
 import logging
 import yaml
-from ingest import Ingest
+from pipeline.ingest import Ingest
-from raw_to_base import RawToBase
+from pipeline.raw_to_base import RawToBase
 dotenv.load_dotenv()
@@ -48,6 +48,9 @@ class Ingest:
            with open(self.knowledge_file, 'r') as f:
                knowledge_cache = json.load(f)
        except FileNotFoundError:
            # If the file does not exist, start from an empty cache
            # and create its parent directory so we can save to it later
            os.makedirs(os.path.dirname(self.knowledge_file), exist_ok=True)
            knowledge_cache = {}
        knowledge_cache[entity] = server_knowledge
@@ -3,7 +3,7 @@ import polars as pl
 entities = ['accounts', 'categories', 'months', 'payees', 'transactions', 'scheduled_transactions']
 for entity in entities:
-    print(f"Processing entity: {entity}")
+   # print(f"Processing entity: {entity}")
    file_path = f'data/base/{entity}.parquet'
    # Read the parquet file into a polars DataFrame
    entity_df = pl.read_parquet(file_path)
@@ -11,5 +11,5 @@ for entity in entities:
    print(f"Schema of {entity} DataFrame:")
    print(entity_df.schema)
    # Display the first few rows of the DataFrame
-    print(f"First few rows of {entity} DataFrame:")
+   # print(f"First few rows of {entity} DataFrame:")
-    print(entity_df.head())
+   # print(entity_df.head())