base working fine i think
This commit is contained in:
+22
@@ -0,0 +1,22 @@
|
|||||||
|
entities:
|
||||||
|
- accounts
|
||||||
|
- categories
|
||||||
|
- months
|
||||||
|
- payees
|
||||||
|
- transactions
|
||||||
|
- scheduled_transactions
|
||||||
|
base_url: https://api.ynab.com/v1/budgets
|
||||||
|
knowledge_file: server_knowledge_cache.json
|
||||||
|
primary_keys:
|
||||||
|
accounts:
|
||||||
|
unique_id: id
|
||||||
|
categories:
|
||||||
|
unique_id: id
|
||||||
|
months:
|
||||||
|
unique_id: month
|
||||||
|
payees:
|
||||||
|
unique_id: id
|
||||||
|
transactions:
|
||||||
|
unique_id: id
|
||||||
|
scheduled_transactions:
|
||||||
|
unique_id: id
|
||||||
+24
@@ -0,0 +1,24 @@
|
|||||||
|
# ERD for the Finance DataWarehouse
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
erDiagram
|
||||||
|
ACCOUNTS {
|
||||||
|
int account_id
|
||||||
|
string account_name
|
||||||
|
string account_type
|
||||||
|
boolean on_budget
|
||||||
|
boolean closed
|
||||||
|
text note
|
||||||
|
decimal balance
|
||||||
|
decimal cleared_balance
|
||||||
|
decimal uncleared_balance
|
||||||
|
boolean deleted
|
||||||
|
}
|
||||||
|
|
||||||
|
ACCOUNT_TYPES {
|
||||||
|
int account_type_id
|
||||||
|
string account_type_name
|
||||||
|
}
|
||||||
|
|
||||||
|
ACCOUNTS ||--o{ ACCOUNT_TYPES : "has type"
|
||||||
|
```
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
# Flow of data from source to gold
|
||||||
|
|
||||||
@@ -75,6 +75,11 @@ class Ingest:
|
|||||||
Fetch and cache data for all entities.
|
Fetch and cache data for all entities.
|
||||||
"""
|
"""
|
||||||
for entity in self.entities:
|
for entity in self.entities:
|
||||||
|
# if we already have files in the raw data folder, we need to skip that entity
|
||||||
|
file_path = f'data/raw/{entity}'
|
||||||
|
if os.path.exists(file_path) and os.listdir(file_path):
|
||||||
|
logging.warning(f"Skipping entity: {entity} as the raw data folder is not empty.")
|
||||||
|
continue
|
||||||
last_knowledge = self.knowledge_cache.get(entity, 0)
|
last_knowledge = self.knowledge_cache.get(entity, 0)
|
||||||
logging.debug(f'Last Knowledge of {entity.capitalize()}: {last_knowledge}')
|
logging.debug(f'Last Knowledge of {entity.capitalize()}: {last_knowledge}')
|
||||||
url = f'{self.base_url}/{self.budget_id}/{entity}'
|
url = f'{self.base_url}/{self.budget_id}/{entity}'
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
import dotenv
|
import dotenv
|
||||||
import logging
|
import logging
|
||||||
|
import yaml
|
||||||
|
|
||||||
from ingest import Ingest
|
from ingest import Ingest
|
||||||
from raw_to_base import RawToBase
|
from raw_to_base import RawToBase
|
||||||
|
|
||||||
@@ -10,15 +12,11 @@ API_TOKEN = os.getenv('API_TOKEN')
|
|||||||
BUDGET_ID = os.getenv('BUDGET_ID')
|
BUDGET_ID = os.getenv('BUDGET_ID')
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
entities = ['accounts', 'categories', 'months', 'payees', 'transactions', 'scheduled_transactions']
|
with open('config.yaml', 'r') as file:
|
||||||
ingest_info = {}
|
config = yaml.safe_load(file)
|
||||||
|
|
||||||
ingest_info['entities'] = entities
|
config['API_TOKEN'] = API_TOKEN
|
||||||
ingest_info['base_url'] = 'https://api.ynab.com/v1/budgets'
|
config['BUDGET_ID'] = BUDGET_ID
|
||||||
ingest_info['knowledge_file'] = 'server_knowledge_cache.json'
|
|
||||||
ingest_info['API_TOKEN'] = API_TOKEN
|
|
||||||
ingest_info['BUDGET_ID'] = BUDGET_ID
|
|
||||||
|
|
||||||
|
Ingest(config)
|
||||||
Ingest(ingest_info)
|
RawToBase(config)
|
||||||
RawToBase(entities)
|
|
||||||
+14
-13
@@ -6,19 +6,12 @@ from typing import List, Dict, Any
|
|||||||
import polars as pl
|
import polars as pl
|
||||||
|
|
||||||
class RawToBase:
|
class RawToBase:
|
||||||
def __init__(self, entities: List[str]):
|
def __init__(self, config: Dict[str, Any]):
|
||||||
self.entities = entities
|
self.entities = config['entities']
|
||||||
self.config = {
|
self.primary_keys = config['primary_keys']
|
||||||
'accounts': {'unique_id': 'id'},
|
|
||||||
'categories': {'unique_id': 'id'},
|
|
||||||
'months': {'unique_id': 'month'},
|
|
||||||
'payees': {'unique_id': 'id'},
|
|
||||||
'transactions': {'unique_id': 'id'},
|
|
||||||
'scheduled_transactions': {'unique_id': 'id'}
|
|
||||||
}
|
|
||||||
self.raw_data_path = 'data/raw'
|
self.raw_data_path = 'data/raw'
|
||||||
self.base_data_path = 'data/base'
|
|
||||||
self.processed_data_path = 'data/processed'
|
self.processed_data_path = 'data/processed'
|
||||||
|
self.base_data_path = 'data/base'
|
||||||
self.data = {}
|
self.data = {}
|
||||||
self.base_data = {}
|
self.base_data = {}
|
||||||
logging.basicConfig(level=logging.DEBUG)
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
@@ -55,6 +48,14 @@ class RawToBase:
|
|||||||
with open(file_path, 'r') as f:
|
with open(file_path, 'r') as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
# Check if the data is empty
|
# Check if the data is empty
|
||||||
|
if entity == "categories":
|
||||||
|
# Check if any category group has categories
|
||||||
|
has_categories = any(group.get("categories") for group in data.get("category_groups", []))
|
||||||
|
if not has_categories:
|
||||||
|
logging.warning(f"Received empty data for entity: {entity} in file: {file_path}, deleting file.")
|
||||||
|
os.remove(file_path)
|
||||||
|
return False
|
||||||
|
else:
|
||||||
if not data.get(entity, []):
|
if not data.get(entity, []):
|
||||||
logging.warning(f"Received empty data for entity: {entity} in file: {file_path}, deleting file.")
|
logging.warning(f"Received empty data for entity: {entity} in file: {file_path}, deleting file.")
|
||||||
# delete the file as it is empty
|
# delete the file as it is empty
|
||||||
@@ -105,7 +106,7 @@ class RawToBase:
|
|||||||
#print(new_data_df)
|
#print(new_data_df)
|
||||||
|
|
||||||
# Ensure the unique id column is preserved
|
# Ensure the unique id column is preserved
|
||||||
unique_id = self.config[entity]['unique_id']
|
unique_id = self.primary_keys[entity]['unique_id']
|
||||||
if unique_id not in new_data_df.columns:
|
if unique_id not in new_data_df.columns:
|
||||||
logging.error(f"Unique ID column '{unique_id}' not found in the combined data for entity: {entity}")
|
logging.error(f"Unique ID column '{unique_id}' not found in the combined data for entity: {entity}")
|
||||||
exit(1)
|
exit(1)
|
||||||
@@ -115,7 +116,7 @@ class RawToBase:
|
|||||||
|
|
||||||
def _resolve_duplicates(self, entity):
|
def _resolve_duplicates(self, entity):
|
||||||
logging.debug(f"Resolving duplicates for entity: {entity}")
|
logging.debug(f"Resolving duplicates for entity: {entity}")
|
||||||
unique_id = self.config[entity]['unique_id']
|
unique_id = self.primary_keys[entity]['unique_id']
|
||||||
self.base_data[entity] = self.base_data[entity].sort(by='ingestion_date').unique(subset=unique_id, keep='first')
|
self.base_data[entity] = self.base_data[entity].sort(by='ingestion_date').unique(subset=unique_id, keep='first')
|
||||||
logging.debug(f"Successfully resolved duplicates for entity: {entity}")
|
logging.debug(f"Successfully resolved duplicates for entity: {entity}")
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
python-dotenv
|
python-dotenv
|
||||||
polars
|
polars
|
||||||
requests
|
requests
|
||||||
|
pyyaml
|
||||||
Reference in New Issue
Block a user