From 165464820b9c9f3c09e415877a2fad6e3d33e397 Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Fri, 9 Aug 2024 14:23:52 +0100 Subject: [PATCH] starting to build the warehouse --- .gitignore | 1 + config.yaml | 2 +- docs/ERD.md | 58 ++++++++++++++++++++++- main.py | 4 +- pipeline/__init__.py | 0 dimAccounts.py => pipeline/dimAccounts.py | 0 ingest.py => pipeline/ingest.py | 3 ++ raw_to_base.py => pipeline/raw_to_base.py | 0 test.py | 6 +-- 9 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 pipeline/__init__.py rename dimAccounts.py => pipeline/dimAccounts.py (100%) rename ingest.py => pipeline/ingest.py (95%) rename raw_to_base.py => pipeline/raw_to_base.py (100%) diff --git a/.gitignore b/.gitignore index 8c96830..049d696 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ server_knowledge_cache.json data/* .venv/* __pycache__/* +*/__pycache__/* diff --git a/config.yaml b/config.yaml index 1e77a95..8d35b75 100644 --- a/config.yaml +++ b/config.yaml @@ -6,7 +6,7 @@ entities: - transactions - scheduled_transactions base_url: https://api.ynab.com/v1/budgets -knowledge_file: server_knowledge_cache.json +knowledge_file: data\server_knowledge_cache.json primary_keys: accounts: unique_id: id diff --git a/docs/ERD.md b/docs/ERD.md index aa3d17f..5c18d09 100644 --- a/docs/ERD.md +++ b/docs/ERD.md @@ -20,5 +20,61 @@ erDiagram string account_type_name } + CATEGORIES { + int category_id + string category_name + string category_group_name + boolean hidden + text note + decimal budgeted + decimal activity + decimal balance + boolean deleted + } + + PAYEES { + int payee_id + string payee_name + boolean deleted + } + + DATES { + int date_id + string date + int year + int month + int day + } + + TRANSACTIONS { + int transaction_id + int account_id + int category_id + int payee_id + int date_id + decimal amount + boolean cleared + boolean approved + boolean deleted + } + + SCHEDULED_TRANSACTIONS { + int scheduled_transaction_id + int account_id + int 
category_id + int payee_id + int date_id + decimal amount + string frequency + boolean deleted + } + ACCOUNTS ||--o{ ACCOUNT_TYPES : "has type" -``` \ No newline at end of file + TRANSACTIONS ||--o{ ACCOUNTS : "belongs to" + TRANSACTIONS ||--o{ CATEGORIES : "belongs to" + TRANSACTIONS ||--o{ PAYEES : "belongs to" + TRANSACTIONS ||--o{ DATES : "occurred on" + SCHEDULED_TRANSACTIONS ||--o{ ACCOUNTS : "belongs to" + SCHEDULED_TRANSACTIONS ||--o{ CATEGORIES : "belongs to" + SCHEDULED_TRANSACTIONS ||--o{ PAYEES : "belongs to" + SCHEDULED_TRANSACTIONS ||--o{ DATES : "scheduled on" \ No newline at end of file diff --git a/main.py b/main.py index f6156aa..6448b39 100644 --- a/main.py +++ b/main.py @@ -3,8 +3,8 @@ import dotenv import logging import yaml -from ingest import Ingest -from raw_to_base import RawToBase +from pipeline.ingest import Ingest +from pipeline.raw_to_base import RawToBase dotenv.load_dotenv() diff --git a/pipeline/__init__.py b/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dimAccounts.py b/pipeline/dimAccounts.py similarity index 100% rename from dimAccounts.py rename to pipeline/dimAccounts.py diff --git a/ingest.py b/pipeline/ingest.py similarity index 95% rename from ingest.py rename to pipeline/ingest.py index 6756c8b..6dc6e0a 100644 --- a/ingest.py +++ b/pipeline/ingest.py @@ -48,6 +48,9 @@ class Ingest: with open(self.knowledge_file, 'r') as f: knowledge_cache = json.load(f) except FileNotFoundError: + # If the file does not exist, create an empty cache + # also create its parent directory so we can save to it later + os.makedirs(os.path.dirname(self.knowledge_file), exist_ok=True) knowledge_cache = {} knowledge_cache[entity] = server_knowledge diff --git a/raw_to_base.py b/pipeline/raw_to_base.py similarity index 100% rename from raw_to_base.py rename to pipeline/raw_to_base.py diff --git a/test.py b/test.py index f02d1b4..930a1b1 100644 --- a/test.py +++ b/test.py @@ -3,7 +3,7 @@ import polars as pl entities = ['accounts', 
'categories', 'months', 'payees', 'transactions', 'scheduled_transactions'] for entity in entities: - print(f"Processing entity: {entity}") + # print(f"Processing entity: {entity}") file_path = f'data/base/{entity}.parquet' # Read the parquet file into a polars DataFrame entity_df = pl.read_parquet(file_path) @@ -11,5 +11,5 @@ for entity in entities: print(f"Schema of {entity} DataFrame:") print(entity_df.schema) # Display the first few rows of the DataFrame - print(f"First few rows of {entity} DataFrame:") - print(entity_df.head()) + # print(f"First few rows of {entity} DataFrame:") + # print(entity_df.head())