From 165464820b9c9f3c09e415877a2fad6e3d33e397 Mon Sep 17 00:00:00 2001
From: Jake Pullen <jpullen@anglianwater.co.uk>
Date: Fri, 9 Aug 2024 14:23:52 +0100
Subject: [PATCH] starting to build the warehouse

---
 .gitignore                                |  1 +
 config.yaml                               |  2 +-
 docs/ERD.md                               | 58 ++++++++++++++++++++++-
 main.py                                   |  4 +-
 pipeline/__init__.py                      |  0
 dimAccounts.py => pipeline/dimAccounts.py |  0
 ingest.py => pipeline/ingest.py           |  3 ++
 raw_to_base.py => pipeline/raw_to_base.py |  0
 test.py                                   |  6 +--
 9 files changed, 67 insertions(+), 7 deletions(-)
 create mode 100644 pipeline/__init__.py
 rename dimAccounts.py => pipeline/dimAccounts.py (100%)
 rename ingest.py => pipeline/ingest.py (95%)
 rename raw_to_base.py => pipeline/raw_to_base.py (100%)

diff --git a/.gitignore b/.gitignore
index 8c96830..049d696 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ server_knowledge_cache.json
 data/*
 .venv/*
 __pycache__/*
+*/__pycache__/*
diff --git a/config.yaml b/config.yaml
index 1e77a95..8d35b75 100644
--- a/config.yaml
+++ b/config.yaml
@@ -6,7 +6,7 @@ entities:
   - transactions
   - scheduled_transactions
 base_url: https://api.ynab.com/v1/budgets
-knowledge_file: server_knowledge_cache.json
+knowledge_file: data\server_knowledge_cache.json
 primary_keys:
   accounts:
     unique_id: id
diff --git a/docs/ERD.md b/docs/ERD.md
index aa3d17f..5c18d09 100644
--- a/docs/ERD.md
+++ b/docs/ERD.md
@@ -20,5 +20,61 @@ erDiagram
         string account_type_name
     }
     
+    CATEGORIES {
+        int category_id
+        string category_name
+        string category_group_name
+        boolean hidden
+        text note
+        decimal budgeted
+        decimal activity
+        decimal balance
+        boolean deleted
+    }
+    
+    PAYEES {
+        int payee_id
+        string payee_name
+        boolean deleted
+    }
+    
+    DATES {
+        int date_id
+        string date
+        int year
+        int month
+        int day
+    }
+    
+    TRANSACTIONS {
+        int transaction_id
+        int account_id
+        int category_id
+        int payee_id
+        int date_id
+        decimal amount
+        boolean cleared
+        boolean approved
+        boolean deleted
+    }
+    
+    SCHEDULED_TRANSACTIONS {
+        int scheduled_transaction_id
+        int account_id
+        int category_id
+        int payee_id
+        int date_id
+        decimal amount
+        string frequency
+        boolean deleted
+    }
+    
     ACCOUNTS ||--o{ ACCOUNT_TYPES : "has type"
-```
\ No newline at end of file
+    TRANSACTIONS ||--o{ ACCOUNTS : "belongs to"
+    TRANSACTIONS ||--o{ CATEGORIES : "belongs to"
+    TRANSACTIONS ||--o{ PAYEES : "belongs to"
+    TRANSACTIONS ||--o{ DATES : "occurred on"
+    SCHEDULED_TRANSACTIONS ||--o{ ACCOUNTS : "belongs to"
+    SCHEDULED_TRANSACTIONS ||--o{ CATEGORIES : "belongs to"
+    SCHEDULED_TRANSACTIONS ||--o{ PAYEES : "belongs to"
+    SCHEDULED_TRANSACTIONS ||--o{ DATES : "scheduled on"
\ No newline at end of file
diff --git a/main.py b/main.py
index f6156aa..6448b39 100644
--- a/main.py
+++ b/main.py
@@ -3,8 +3,8 @@ import dotenv
 import logging
 import yaml
 
-from ingest import Ingest
-from raw_to_base import RawToBase
+from pipeline.ingest import Ingest
+from pipeline.raw_to_base import RawToBase
 
 dotenv.load_dotenv()
 
diff --git a/pipeline/__init__.py b/pipeline/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dimAccounts.py b/pipeline/dimAccounts.py
similarity index 100%
rename from dimAccounts.py
rename to pipeline/dimAccounts.py
diff --git a/ingest.py b/pipeline/ingest.py
similarity index 95%
rename from ingest.py
rename to pipeline/ingest.py
index 6756c8b..6dc6e0a 100644
--- a/ingest.py
+++ b/pipeline/ingest.py
@@ -48,6 +48,9 @@ class Ingest:
             with open(self.knowledge_file, 'r') as f:
                 knowledge_cache = json.load(f)
         except FileNotFoundError:
+            # If the file does not exist, start from an empty cache and
+            # create its parent directory so the cache can be saved later
+            os.makedirs(os.path.dirname(self.knowledge_file), exist_ok=True)
             knowledge_cache = {}
         
         knowledge_cache[entity] = server_knowledge
diff --git a/raw_to_base.py b/pipeline/raw_to_base.py
similarity index 100%
rename from raw_to_base.py
rename to pipeline/raw_to_base.py
diff --git a/test.py b/test.py
index f02d1b4..930a1b1 100644
--- a/test.py
+++ b/test.py
@@ -3,7 +3,7 @@ import polars as pl
 entities = ['accounts', 'categories', 'months', 'payees', 'transactions', 'scheduled_transactions']
 
 for entity in entities:
-    print(f"Processing entity: {entity}")
+   # print(f"Processing entity: {entity}")
     file_path = f'data/base/{entity}.parquet'
     # Read the parquet file into a polars DataFrame
     entity_df = pl.read_parquet(file_path)
@@ -11,5 +11,5 @@ for entity in entities:
     print(f"Schema of {entity} DataFrame:")
     print(entity_df.schema)
     # Display the first few rows of the DataFrame
-    print(f"First few rows of {entity} DataFrame:")
-    print(entity_df.head())
+   # print(f"First few rows of {entity} DataFrame:")
+   # print(entity_df.head())