From b29b738a7ad63b65801bba46ee0a3483ca40b723 Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Sat, 10 Aug 2024 09:50:37 +0100 Subject: [PATCH] ready for open source --- ReadMe.md | 20 ++++++++------------ docs/ERD.md | 10 +++------- docs/Get_Started.md | 41 +++++++++++++++++++++++++++++++++++++++++ docs/dataflow.md | 28 ++++++++++++++++++++++++++++ pipeline/dimensions.py | 1 + test.py | 29 ----------------------------- 6 files changed, 81 insertions(+), 48 deletions(-) create mode 100644 docs/Get_Started.md delete mode 100644 test.py diff --git a/ReadMe.md b/ReadMe.md index 57695d2..98b52dc 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -1,22 +1,18 @@ # Budget Management System -This project is a Budget Management System that fetches and caches budget-related data from an API. It organizes the data into various categories and handles rate limits to ensure smooth operation. +This project is a Budget Management System that fetches and caches budget-related data from an API. It organizes the data into a dimensional model and handles rate limits to ensure smooth operation. -## Project Structure - -## Setup - -1. **Set up environment variables:** - - Create a `.env` file in the root directory and add your API token and budget ID: - ``` - API_TOKEN=your_api_token - BUDGET_ID=your_budget_id - ``` +## [Get Started Guide](docs/Get_Started.md) ## Usage +This tool has been designed to fetch and cache budget-related data from the YNAB API. It can be used multiple times a day or once a year or anything in between. It is designed to be flexible and easy to use. +Once it is on your local machine, you can run it by executing the `main.py` file. This will handle situations where you have not run the tool before, or where you have run it before and need to update the data. Running it multiple times will not duplicate any data. + ## Contributing +Not expecting any contributions at this time. 
+ ## License +[GPL-3.0](https://choosealicense.com/licenses/gpl-3.0/) diff --git a/docs/ERD.md b/docs/ERD.md index 5c18d09..4451527 100644 --- a/docs/ERD.md +++ b/docs/ERD.md @@ -15,11 +15,6 @@ erDiagram boolean deleted } - ACCOUNT_TYPES { - int account_type_id - string account_type_name - } - CATEGORIES { int category_id string category_name @@ -69,7 +64,6 @@ erDiagram boolean deleted } - ACCOUNTS ||--o{ ACCOUNT_TYPES : "has type" TRANSACTIONS ||--o{ ACCOUNTS : "belongs to" TRANSACTIONS ||--o{ CATEGORIES : "belongs to" TRANSACTIONS ||--o{ PAYEES : "belongs to" @@ -77,4 +71,6 @@ erDiagram SCHEDULED_TRANSACTIONS ||--o{ ACCOUNTS : "belongs to" SCHEDULED_TRANSACTIONS ||--o{ CATEGORIES : "belongs to" SCHEDULED_TRANSACTIONS ||--o{ PAYEES : "belongs to" - SCHEDULED_TRANSACTIONS ||--o{ DATES : "scheduled on" \ No newline at end of file + SCHEDULED_TRANSACTIONS ||--o{ DATES : "scheduled on" +``` + diff --git a/docs/Get_Started.md b/docs/Get_Started.md new file mode 100644 index 0000000..05c93f4 --- /dev/null +++ b/docs/Get_Started.md @@ -0,0 +1,41 @@ +# How to get started + +This document will guide you through the process of setting up the project on your local machine. It will cover the following topics: + +- [Prerequisites](#prerequisites) +- [Setting up the project](#setting-up-the-project) +- [Running the project](#running-the-project) + +## Prerequisites + +### .env file + +Create a `.env` file in the root of the project with the following content: + +```bash +API_TOKEN=your_api_token_here +BUDGET_ID=your_budget_id_here +``` + +You can follow [This Link](https://api.ynab.com/#access-token-usage:~:text=ynab.com.-,Quick%20Start,-If%20you%27re%20the) for a guide on how to get your API token. +For the `BUDGET_ID`, you can get it from the URL of your budget page on the YNAB website. It is in between the `app.ynab.com/` and the `/budget/` in the URL. For example, if your URL is `https://app.ynab.com/your_budget_id/budget`, then your `BUDGET_ID` is `your_budget_id`. 
+ +## Setting up the project + +### Clone the repository + +```bash +git clone #link tbc +``` + +### Install dependencies + +```bash +pip install -r requirements.txt +``` + +## Running the project + +```bash +python3 main.py +``` diff --git a/docs/dataflow.md b/docs/dataflow.md index 6d3c736..37f0fb7 100644 --- a/docs/dataflow.md +++ b/docs/dataflow.md @@ -1,2 +1,30 @@ # Flow of data from source to gold +```mermaid +graph TD + A[Source Data] --> B[Raw Data/Bronze] + B --> C[Base Data/Silver] + C --> D[Data Warehouse/Gold] + B --> G[Processed Archive] +``` + +## Source + +The Source Data is hosted in a web application called [You Need A Budget](https://www.youneedabudget.com/). We pull the data from the [YNAB API](https://api.ynab.com/), using the access token method of authentication. +The data is in JSON format. + +## Raw Data/Bronze + +The Raw Data is the data as it is pulled from the YNAB API. It is stored as JSON files in the `data/raw/` directory with a folder for each entity. + +## Base Data/Silver + +The Base Data is the data after it has been cleaned and transformed. It is stored as parquet files in the `data/base/` directory with a file for each entity. + +## Data Warehouse/Gold + +The Data Warehouse is the data after it has been aggregated and transformed. It is stored as parquet files in the `data/warehouse/` directory with a file for each entity. + +## Processed Archive + +The Processed Archive is the data after it has been processed and stored in the base tables. It consists of the raw JSON files in the `data/processed/` directory, with a folder for each entity and a file for each load that has been processed. 
diff --git a/pipeline/dimensions.py b/pipeline/dimensions.py index 5753ee8..3fed4f7 100644 --- a/pipeline/dimensions.py +++ b/pipeline/dimensions.py @@ -2,6 +2,7 @@ import polars as pl import logging import os from datetime import date + class Dimensions: def __init__(self, config): self.config = config diff --git a/test.py b/test.py deleted file mode 100644 index 93929c9..0000000 --- a/test.py +++ /dev/null @@ -1,29 +0,0 @@ -import polars as pl - -#entities = ['accounts', 'categories', 'months', 'payees', 'transactions', 'scheduled_transactions'] -entities = ['scheduled_transactions'] - - -for entity in entities: - # print(f"Processing entity: {entity}") - file_path = f'data/base/{entity}.parquet' - # Read the parquet file into a polars DataFrame - entity_df = pl.read_parquet(file_path) - # Print the schema of the DataFrame - print(f"Schema of {entity} DataFrame:") - print(entity_df.schema) - # Display the first few rows of the DataFrame - print(f"First few rows of {entity} DataFrame:") - print(entity_df.head()) - -# for entity in entities: -# # print(f"Processing entity: {entity}") -# file_path = f'data/warehouse/{entity}.parquet' -# # Read the parquet file into a polars DataFrame -# entity_df = pl.read_parquet(file_path) -# # Print the schema of the DataFrame -# print(f"Schema of {entity} DataFrame:") -# print(entity_df.schema) -# # Display the first few rows of the DataFrame -# print(f"First few rows of {entity} DataFrame:") -# print(entity_df)